Files
minne/eval/src/args.rs
Per Stark 0eda65b07e benchmarks: v1
Benchmarking ingestion, retrieval precision and performance
2025-11-18 11:50:15 +01:00

639 lines
26 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
use std::{
env,
path::{Path, PathBuf},
};
use anyhow::{anyhow, Context, Result};
use crate::datasets::DatasetKind;
/// Default value for `Config::slice_seed` ("seed"-flavored hex constant);
/// there is no CLI flag to change it in this parser.
pub const DEFAULT_SLICE_SEED: u64 = 0x5eed_2025;
/// Embedding backend selected via the `--embedding` CLI flag.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EmbeddingBackend {
    /// Hash-based embeddings (`--embedding hashed`). Rejects `--embedding-model`.
    Hashed,
    /// FastEmbed embeddings (`--embedding fastembed`); the default backend.
    FastEmbed,
}
impl Default for EmbeddingBackend {
fn default() -> Self {
Self::FastEmbed
}
}
impl std::str::FromStr for EmbeddingBackend {
    type Err = anyhow::Error;

    /// Parses a backend name case-insensitively.
    ///
    /// Accepts `"hashed"` for [`Self::Hashed`] and any of `"fastembed"`,
    /// `"fast-embed"`, or `"fast"` for [`Self::FastEmbed`].
    ///
    /// # Errors
    /// Returns an error quoting the input when it matches none of the
    /// accepted spellings.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Compare case-insensitively without allocating a lowercased copy.
        if s.eq_ignore_ascii_case("hashed") {
            Ok(Self::Hashed)
        } else if ["fastembed", "fast-embed", "fast"]
            .iter()
            .any(|alias| s.eq_ignore_ascii_case(alias))
        {
            Ok(Self::FastEmbed)
        } else {
            // Bug fix: report the input exactly as the user typed it — the
            // previous version lowercased it first, so the message could show
            // a value the user never entered.
            Err(anyhow!(
                "unknown embedding backend '{s}'. Expected 'hashed' or 'fastembed'."
            ))
        }
    }
}
/// Fully-resolved settings for an evaluation run.
///
/// Built by `parse()`: CLI flags (and a few `EVAL_*` environment variables)
/// are overlaid on [`Config::default`]. Field meanings mirror the flags
/// documented in `print_help`.
#[derive(Debug, Clone)]
pub struct Config {
    // Dataset conversion.
    /// `--convert-only`: convert the selected dataset and exit.
    pub convert_only: bool,
    /// `--force` / `--refresh`: regenerate the converted dataset even if present.
    pub force_convert: bool,
    /// `--dataset`: which dataset to evaluate.
    pub dataset: DatasetKind,
    /// `--llm-mode`: enable LLM-assisted evaluation features.
    pub llm_mode: bool,
    /// `--corpus-limit`: cap on the slice corpus size; `None` until derived in `parse()`.
    pub corpus_limit: Option<usize>,
    /// `--raw`: path to the raw dataset (defaults per dataset kind).
    pub raw_dataset_path: PathBuf,
    /// `--converted`: path to write/read the converted dataset.
    pub converted_dataset_path: PathBuf,
    /// `--report-dir`: directory that receives evaluation reports.
    pub report_dir: PathBuf,
    /// `--k`: precision@k cutoff (validated non-zero).
    pub k: usize,
    /// `--limit`: number of questions to evaluate; `None` means all (flag value 0).
    pub limit: Option<usize>,
    /// `--sample`: mismatches surfaced in the Markdown summary (min 1).
    pub summary_sample: usize,
    /// `--full-context`: disable context cropping when converting datasets.
    pub full_context: bool,
    // Chunking parameters (None = use downstream defaults).
    /// `--chunk-min`: minimum characters per chunk.
    pub chunk_min_chars: usize,
    /// `--chunk-max`: maximum characters per chunk (must exceed `chunk_min_chars`).
    pub chunk_max_chars: usize,
    /// `--chunk-vector-take`: override for the chunk vector candidate cap.
    pub chunk_vector_take: Option<usize>,
    /// `--chunk-fts-take`: override for the chunk FTS candidate cap.
    pub chunk_fts_take: Option<usize>,
    /// `--chunk-token-budget`: override for the chunk token budget estimate.
    pub chunk_token_budget: Option<usize>,
    /// `--chunk-token-chars`: average characters per token used for budgeting.
    pub chunk_avg_chars_per_token: Option<usize>,
    /// `--max-chunks-per-entity`: override for chunks attached per entity.
    pub max_chunks_per_entity: Option<usize>,
    // Reranking / parallelism.
    /// Reranking enabled? (`--no-rerank` turns this off.)
    pub rerank: bool,
    /// `--rerank-pool`: reranking engine pool size / parallelism (min 1).
    pub rerank_pool_size: usize,
    /// `--rerank-keep`: entities kept after reranking (min 1).
    pub rerank_keep_top: usize,
    /// `--concurrency`: worker parallelism (min 1).
    /// NOTE(review): this flag is parsed but missing from `print_help` — confirm intent.
    pub concurrency: usize,
    // Embeddings / caches.
    /// `--embedding`: embedding backend selection.
    pub embedding_backend: EmbeddingBackend,
    /// `--embedding-model`: FastEmbed model code; incompatible with the hashed backend.
    pub embedding_model: Option<String>,
    /// `--cache-dir`: directory for embedding caches.
    pub cache_dir: PathBuf,
    /// `--ingestion-cache-dir`: directory for ingestion corpora caches.
    pub ingestion_cache_dir: PathBuf,
    /// `--refresh-embeddings`: recompute embeddings without re-running ingestion.
    pub refresh_embeddings_only: bool,
    /// `--detailed-report`: include entity descriptions/categories in JSON reports.
    pub detailed_report: bool,
    // Slice handling.
    /// `--slice`: cached slice id or explicit path.
    pub slice: Option<String>,
    /// `--reseed-slice`: ignore cached corpus state and rebuild the slice corpus.
    pub reseed_slice: bool,
    /// Seed for slice sampling; always `DEFAULT_SLICE_SEED` (no CLI flag sets it here).
    pub slice_seed: u64,
    /// `--slice-grow`: grow the slice ledger to at least this many answerable cases.
    pub slice_grow: Option<usize>,
    /// `--slice-offset`: starting offset within the slice.
    pub slice_offset: usize,
    /// `--slice-reset-ingestion`: delete cached paragraph shards before rebuilding.
    pub slice_reset_ingestion: bool,
    /// `--negative-multiplier`: target negative:positive paragraph ratio (positive, finite).
    pub negative_multiplier: f32,
    // Reporting / inspection.
    /// `--label`: free-form annotation stored in JSON/Markdown reports.
    pub label: Option<String>,
    /// `--chunk-diagnostics`: JSONL output path for per-query chunk diagnostics.
    pub chunk_diagnostics_path: Option<PathBuf>,
    /// `--inspect-question`: question id to inspect (used with `--inspect-manifest`).
    pub inspect_question: Option<String>,
    /// `--inspect-manifest`: ingestion cache manifest JSON for inspection mode.
    pub inspect_manifest: Option<PathBuf>,
    /// `--query-model`: override for the query model used this run (non-empty).
    pub query_model: Option<String>,
    // Performance logging.
    /// `--perf-log-json`: path for structured perf telemetry JSON.
    pub perf_log_json: Option<PathBuf>,
    /// `--perf-log-dir` (fallback: `EVAL_PERF_LOG_DIR`): dir for timestamped perf copies.
    pub perf_log_dir: Option<PathBuf>,
    /// `--perf-log`: print per-stage timings to stdout.
    pub perf_log_console: bool,
    // SurrealDB connection (each also settable via `EVAL_DB_*` env vars).
    /// `--db-endpoint` / `EVAL_DB_ENDPOINT`.
    pub db_endpoint: String,
    /// `--db-user` / `EVAL_DB_USERNAME`.
    pub db_username: String,
    /// `--db-pass` / `EVAL_DB_PASSWORD`.
    pub db_password: String,
    /// `--db-namespace` / `EVAL_DB_NAMESPACE`.
    pub db_namespace: Option<String>,
    /// `--db-database` / `EVAL_DB_DATABASE`.
    pub db_database: Option<String>,
    /// `--inspect-db-state`: override for the SurrealDB state.json during inspection.
    pub inspect_db_state: Option<PathBuf>,
}
impl Default for Config {
    /// Baseline settings that `parse()` overlays with CLI flags and
    /// `EVAL_*` environment variables.
    fn default() -> Self {
        let dataset = DatasetKind::default();
        // Dataset-dependent paths start from the default dataset's locations;
        // `parse()` recomputes them if `--dataset` changes the kind.
        let raw_dataset_path = dataset.default_raw_path();
        let converted_dataset_path = dataset.default_converted_path();
        Self {
            // Dataset selection and conversion.
            dataset,
            raw_dataset_path,
            converted_dataset_path,
            convert_only: false,
            force_convert: false,
            full_context: false,
            // Evaluation scope.
            llm_mode: false,
            corpus_limit: None,
            k: 5,
            limit: Some(200),
            summary_sample: 5,
            // Chunking.
            chunk_min_chars: 500,
            chunk_max_chars: 2_000,
            chunk_vector_take: None,
            chunk_fts_take: None,
            chunk_token_budget: None,
            chunk_avg_chars_per_token: None,
            max_chunks_per_entity: None,
            // Reranking and parallelism.
            rerank: true,
            rerank_pool_size: 16,
            rerank_keep_top: 10,
            concurrency: 4,
            // Embeddings and caches.
            embedding_backend: EmbeddingBackend::default(),
            embedding_model: None,
            cache_dir: PathBuf::from("eval/cache"),
            ingestion_cache_dir: PathBuf::from("eval/cache/ingested"),
            refresh_embeddings_only: false,
            // Slices.
            slice: None,
            reseed_slice: false,
            slice_seed: DEFAULT_SLICE_SEED,
            slice_grow: None,
            slice_offset: 0,
            slice_reset_ingestion: false,
            negative_multiplier: crate::slices::DEFAULT_NEGATIVE_MULTIPLIER,
            // Reporting and inspection.
            report_dir: PathBuf::from("eval/reports"),
            detailed_report: false,
            label: None,
            chunk_diagnostics_path: None,
            inspect_question: None,
            inspect_manifest: None,
            inspect_db_state: None,
            query_model: None,
            // Performance logging.
            perf_log_json: None,
            perf_log_dir: None,
            perf_log_console: false,
            // SurrealDB connection.
            db_endpoint: "ws://127.0.0.1:8000".to_string(),
            db_username: "root_user".to_string(),
            db_password: "root_password".to_string(),
            db_namespace: None,
            db_database: None,
        }
    }
}
impl Config {
    /// Maximum number of context tokens, or `None` for unlimited.
    ///
    /// NOTE(review): this currently always returns `None`; it looks like a
    /// placeholder for a future configurable limit — confirm with callers
    /// before relying on it.
    pub fn context_token_limit(&self) -> Option<usize> {
        None
    }
}
/// Result of CLI parsing: the resolved [`Config`] plus the help request flag.
#[derive(Debug)]
pub struct ParsedArgs {
    /// Fully-resolved configuration (defaults + CLI flags + env fallbacks).
    pub config: Config,
    /// True when `-h`/`--help` was seen (argument parsing stops at that point).
    pub show_help: bool,
}
/// Parses `std::env::args()` into a [`ParsedArgs`].
///
/// Flags overlay [`Config::default`]. Parsing stops early at `-h`/`--help`.
/// After flag handling the function validates cross-flag invariants (chunk
/// sizes, rerank pool, concurrency, embedding backend/model compatibility),
/// derives a corpus limit from `--limit` when none was given, and finally
/// applies `EVAL_*` environment fallbacks for values not set explicitly on
/// the command line.
///
/// # Errors
/// Returns an error for unknown flags, flags missing their value,
/// unparsable or out-of-range values, and violated cross-flag invariants.
pub fn parse() -> Result<ParsedArgs> {
    let mut config = Config::default();
    let mut show_help = false;
    // Track explicit overrides so that `--dataset` does not clobber
    // user-provided paths, and env fallbacks do not clobber explicit DB flags.
    let mut raw_overridden = false;
    let mut converted_overridden = false;
    let mut db_endpoint_overridden = false;
    let mut db_username_overridden = false;
    let mut db_password_overridden = false;
    let mut args = env::args().skip(1).peekable();
    while let Some(arg) = args.next() {
        match arg.as_str() {
            "-h" | "--help" => {
                show_help = true;
                break;
            }
            "--convert-only" => config.convert_only = true,
            "--force" | "--refresh" => config.force_convert = true,
            "--llm-mode" => config.llm_mode = true,
            "--dataset" => {
                let value = take_value("--dataset", &mut args)?;
                let parsed = value.parse::<DatasetKind>()?;
                config.dataset = parsed;
                // Recompute dataset-dependent paths unless the user pinned
                // them with --raw / --converted (in any argument order).
                if !raw_overridden {
                    config.raw_dataset_path = parsed.default_raw_path();
                }
                if !converted_overridden {
                    config.converted_dataset_path = parsed.default_converted_path();
                }
            }
            "--slice" => config.slice = Some(take_value("--slice", &mut args)?),
            "--label" => config.label = Some(take_value("--label", &mut args)?),
            "--query-model" => {
                let value = take_value("--query-model", &mut args)?;
                if value.trim().is_empty() {
                    return Err(anyhow!("--query-model requires a non-empty model name"));
                }
                config.query_model = Some(value.trim().to_string());
            }
            "--slice-grow" => {
                config.slice_grow = Some(take_nonzero_usize("--slice-grow", &mut args)?);
            }
            "--slice-offset" => {
                config.slice_offset = take_usize("--slice-offset", &mut args)?;
            }
            "--raw" => {
                config.raw_dataset_path = PathBuf::from(take_value("--raw", &mut args)?);
                raw_overridden = true;
            }
            "--converted" => {
                config.converted_dataset_path =
                    PathBuf::from(take_value("--converted", &mut args)?);
                converted_overridden = true;
            }
            "--corpus-limit" => {
                let parsed = take_usize("--corpus-limit", &mut args)?;
                // 0 means "no explicit cap" (a default is derived below).
                config.corpus_limit = if parsed == 0 { None } else { Some(parsed) };
            }
            "--reseed-slice" => config.reseed_slice = true,
            "--slice-reset-ingestion" => config.slice_reset_ingestion = true,
            "--report-dir" => {
                config.report_dir = PathBuf::from(take_value("--report-dir", &mut args)?);
            }
            "--k" => config.k = take_nonzero_usize("--k", &mut args)?,
            "--limit" => {
                let parsed = take_usize("--limit", &mut args)?;
                // 0 means "evaluate everything".
                config.limit = if parsed == 0 { None } else { Some(parsed) };
            }
            "--sample" => {
                config.summary_sample = take_usize("--sample", &mut args)?.max(1);
            }
            "--full-context" => config.full_context = true,
            "--chunk-min" => {
                config.chunk_min_chars = take_usize("--chunk-min", &mut args)?.max(1);
            }
            "--chunk-max" => {
                config.chunk_max_chars = take_usize("--chunk-max", &mut args)?.max(1);
            }
            "--chunk-vector-take" => {
                config.chunk_vector_take =
                    Some(take_nonzero_usize("--chunk-vector-take", &mut args)?);
            }
            "--chunk-fts-take" => {
                config.chunk_fts_take =
                    Some(take_nonzero_usize("--chunk-fts-take", &mut args)?);
            }
            "--chunk-token-budget" => {
                config.chunk_token_budget =
                    Some(take_nonzero_usize("--chunk-token-budget", &mut args)?);
            }
            "--chunk-token-chars" => {
                config.chunk_avg_chars_per_token =
                    Some(take_nonzero_usize("--chunk-token-chars", &mut args)?);
            }
            "--max-chunks-per-entity" => {
                config.max_chunks_per_entity =
                    Some(take_nonzero_usize("--max-chunks-per-entity", &mut args)?);
            }
            "--embedding" => {
                config.embedding_backend = take_value("--embedding", &mut args)?.parse()?;
            }
            "--embedding-model" => {
                let value = take_value("--embedding-model", &mut args)?;
                config.embedding_model = Some(value.trim().to_string());
            }
            "--cache-dir" => {
                config.cache_dir = PathBuf::from(take_value("--cache-dir", &mut args)?);
            }
            "--ingestion-cache-dir" => {
                config.ingestion_cache_dir =
                    PathBuf::from(take_value("--ingestion-cache-dir", &mut args)?);
            }
            "--negative-multiplier" => {
                let value = take_value("--negative-multiplier", &mut args)?;
                let parsed = value.parse::<f32>().with_context(|| {
                    format!("failed to parse --negative-multiplier value '{value}' as f32")
                })?;
                // Reject NaN, infinities, zero, and negatives in one test.
                if !(parsed.is_finite() && parsed > 0.0) {
                    return Err(anyhow!(
                        "--negative-multiplier must be a positive finite number"
                    ));
                }
                config.negative_multiplier = parsed;
            }
            "--no-rerank" => config.rerank = false,
            "--rerank-pool" => {
                config.rerank_pool_size = take_usize("--rerank-pool", &mut args)?.max(1);
            }
            "--rerank-keep" => {
                config.rerank_keep_top = take_usize("--rerank-keep", &mut args)?.max(1);
            }
            "--concurrency" => {
                config.concurrency = take_usize("--concurrency", &mut args)?.max(1);
            }
            "--refresh-embeddings" => config.refresh_embeddings_only = true,
            "--detailed-report" => config.detailed_report = true,
            "--chunk-diagnostics" => {
                config.chunk_diagnostics_path =
                    Some(PathBuf::from(take_value("--chunk-diagnostics", &mut args)?));
            }
            "--inspect-question" => {
                config.inspect_question = Some(take_value("--inspect-question", &mut args)?);
            }
            "--inspect-manifest" => {
                config.inspect_manifest =
                    Some(PathBuf::from(take_value("--inspect-manifest", &mut args)?));
            }
            "--inspect-db-state" => {
                config.inspect_db_state =
                    Some(PathBuf::from(take_value("--inspect-db-state", &mut args)?));
            }
            "--perf-log-json" => {
                config.perf_log_json =
                    Some(PathBuf::from(take_value("--perf-log-json", &mut args)?));
            }
            "--perf-log-dir" => {
                config.perf_log_dir =
                    Some(PathBuf::from(take_value("--perf-log-dir", &mut args)?));
            }
            "--perf-log" => config.perf_log_console = true,
            "--db-endpoint" => {
                config.db_endpoint = take_value("--db-endpoint", &mut args)?;
                db_endpoint_overridden = true;
            }
            "--db-user" => {
                config.db_username = take_value("--db-user", &mut args)?;
                db_username_overridden = true;
            }
            "--db-pass" => {
                config.db_password = take_value("--db-pass", &mut args)?;
                db_password_overridden = true;
            }
            "--db-namespace" => {
                config.db_namespace = Some(take_value("--db-namespace", &mut args)?);
            }
            "--db-database" => {
                config.db_database = Some(take_value("--db-database", &mut args)?);
            }
            unknown => {
                return Err(anyhow!(
                    "unknown argument '{unknown}'. Use --help to see available options."
                ));
            }
        }
    }
    // Cross-flag invariants.
    if config.chunk_min_chars >= config.chunk_max_chars {
        return Err(anyhow!(
            "--chunk-min must be less than --chunk-max (got {} >= {})",
            config.chunk_min_chars,
            config.chunk_max_chars
        ));
    }
    if config.rerank && config.rerank_pool_size == 0 {
        return Err(anyhow!(
            "--rerank-pool must be greater than zero when reranking is enabled"
        ));
    }
    if config.concurrency == 0 {
        return Err(anyhow!("--concurrency must be greater than zero"));
    }
    if config.embedding_backend == EmbeddingBackend::Hashed && config.embedding_model.is_some() {
        return Err(anyhow!(
            "--embedding-model cannot be used with the 'hashed' embedding backend"
        ));
    }
    // Derive/repair the corpus limit from --limit.
    if let Some(limit) = config.limit {
        match config.corpus_limit {
            // An explicit corpus cap must cover at least the evaluated questions.
            Some(corpus_limit) if corpus_limit < limit => {
                config.corpus_limit = Some(limit);
            }
            Some(_) => {}
            None => {
                // Default to ~10x the question count, capped at 1000 — but
                // never below `limit` itself. (Bug fix: previously a --limit
                // above 1000 produced a derived cap of 1000 < limit, which
                // contradicted the explicit-cap rule above.)
                let default_multiplier = 10usize;
                let max_cap = 1_000usize;
                let computed = limit
                    .saturating_mul(default_multiplier)
                    .min(max_cap)
                    .max(limit);
                config.corpus_limit = Some(computed);
            }
        }
    }
    // Environment fallbacks: only fill values not set explicitly via flags.
    // (Bug fix: EVAL_DB_ENDPOINT/USERNAME/PASSWORD previously overrode
    // explicit --db-* flags, inconsistent with the perf-log-dir handling.)
    if config.perf_log_dir.is_none() {
        if let Some(dir) = env_nonempty("EVAL_PERF_LOG_DIR") {
            config.perf_log_dir = Some(PathBuf::from(dir));
        }
    }
    if !db_endpoint_overridden {
        if let Some(endpoint) = env_nonempty("EVAL_DB_ENDPOINT") {
            config.db_endpoint = endpoint;
        }
    }
    if !db_username_overridden {
        if let Some(username) = env_nonempty("EVAL_DB_USERNAME") {
            config.db_username = username;
        }
    }
    if !db_password_overridden {
        if let Some(password) = env_nonempty("EVAL_DB_PASSWORD") {
            config.db_password = password;
        }
    }
    if config.db_namespace.is_none() {
        if let Some(ns) = env_nonempty("EVAL_DB_NAMESPACE") {
            config.db_namespace = Some(ns);
        }
    }
    if config.db_database.is_none() {
        if let Some(db) = env_nonempty("EVAL_DB_DATABASE") {
            config.db_database = Some(db);
        }
    }
    Ok(ParsedArgs { config, show_help })
}

/// Pulls the next argument as the value for `flag` and parses it as `usize`.
///
/// The error message format is shared by every numeric flag:
/// `failed to parse <flag> value '<value>' as usize`.
fn take_usize<I>(flag: &str, iter: &mut std::iter::Peekable<I>) -> Result<usize>
where
    I: Iterator<Item = String>,
{
    let value = take_value(flag, iter)?;
    value
        .parse::<usize>()
        .with_context(|| format!("failed to parse {flag} value '{value}' as usize"))
}

/// Like [`take_usize`] but rejects zero with a `<flag> must be greater than
/// zero` error.
fn take_nonzero_usize<I>(flag: &str, iter: &mut std::iter::Peekable<I>) -> Result<usize>
where
    I: Iterator<Item = String>,
{
    let parsed = take_usize(flag, iter)?;
    if parsed == 0 {
        return Err(anyhow!("{flag} must be greater than zero"));
    }
    Ok(parsed)
}

/// Returns the environment variable's value when it is set and not blank.
fn env_nonempty(key: &str) -> Option<String> {
    env::var(key).ok().filter(|v| !v.trim().is_empty())
}
/// Consumes and returns the next CLI argument as the value for `flag`.
///
/// Note: the next token is taken verbatim, so a following flag (e.g.
/// `--k --limit`) would be consumed as the value; validating the value's
/// shape is left to the caller.
///
/// # Errors
/// Fails with `<flag> expects a value` when the argument list is exhausted.
fn take_value<I>(flag: &str, iter: &mut std::iter::Peekable<I>) -> Result<String>
where
    I: Iterator<Item = String>,
{
    // Fix: dropped the unused lifetime parameter `'a` from the signature
    // (clippy: extra_unused_lifetimes); callers are unaffected.
    iter.next().ok_or_else(|| anyhow!("{flag} expects a value"))
}
/// Prints the CLI usage text to stdout.
///
/// NOTE(review): `--concurrency` is accepted by `parse()` but is not listed
/// in this help text — confirm whether the omission is intentional.
pub fn print_help() {
    // The whole help screen is a single literal so the shipped layout is
    // exactly what appears here.
    println!(
        "\
eval — dataset conversion, ingestion, and retrieval evaluation CLI
USAGE:
cargo eval -- [options]
# or
cargo run -p eval -- [options]
OPTIONS:
--convert-only Convert the selected dataset and exit.
--force, --refresh Regenerate the converted dataset even if it already exists.
--dataset <name> Dataset to evaluate: 'squad' (default) or 'natural-questions'.
--llm-mode Enable LLM-assisted evaluation features (includes unanswerable cases).
--slice <id|path> Use a cached dataset slice by id (under eval/cache/slices) or by explicit path.
--label <text> Annotate the run; label is stored in JSON/Markdown reports.
--query-model <name> Override the SurrealDB system settings query model (e.g., gpt-4o-mini) for this run.
--slice-grow <int> Grow the slice ledger to contain at least this many answerable cases, then exit.
--slice-offset <int> Evaluate questions starting at this offset within the slice (default: 0).
--reseed-slice Ignore cached corpus state and rebuild the slice's SurrealDB corpus.
--slice-reset-ingestion
Delete cached paragraph shards before rebuilding the ingestion corpus.
--corpus-limit <int> Cap the slice corpus size (positives + negatives). Defaults to ~10× --limit, capped at 1000.
--raw <path> Path to the raw dataset (defaults per dataset).
--converted <path> Path to write/read the converted dataset (defaults per dataset).
--report-dir <path> Directory to write evaluation reports (default: eval/reports).
--k <int> Precision@k cutoff (default: 5).
--limit <int> Limit the number of questions evaluated (default: 200, 0 = all).
--sample <int> Number of mismatches to surface in the Markdown summary (default: 5).
--full-context Disable context cropping when converting datasets (ingest entire documents).
--chunk-min <int> Minimum characters per chunk for text splitting (default: 500).
--chunk-max <int> Maximum characters per chunk for text splitting (default: 2000).
--chunk-vector-take <int>
Override chunk vector candidate cap (default: 20).
--chunk-fts-take <int>
Override chunk FTS candidate cap (default: 20).
--chunk-token-budget <int>
Override chunk token budget estimate for assembly (default: 10000).
--chunk-token-chars <int>
Override average characters per token used for budgeting (default: 4).
--max-chunks-per-entity <int>
Override maximum chunks attached per entity (default: 4).
--embedding <name> Embedding backend: 'fastembed' (default) or 'hashed'.
--embedding-model <code>
FastEmbed model code (defaults to crate preset when omitted).
--cache-dir <path> Directory for embedding caches (default: eval/cache).
--ingestion-cache-dir <path>
Directory for ingestion corpora caches (default: eval/cache/ingested).
--negative-multiplier <float>
Target negative-to-positive paragraph ratio for slice growth (default: 4.0).
--refresh-embeddings Recompute embeddings for cached corpora without re-running ingestion.
--detailed-report Include entity descriptions and categories in JSON reports.
--chunk-diagnostics <path>
Write per-query chunk diagnostics JSONL to the provided path.
--no-rerank Disable the FastEmbed reranking stage (enabled by default).
--rerank-pool <int> Reranking engine pool size / parallelism (default: 16).
--rerank-keep <int> Keep top-N entities after reranking (default: 10).
--inspect-question <id>
Inspect an ingestion cache question and exit (requires --inspect-manifest).
--inspect-manifest <path>
Path to an ingestion cache manifest JSON for inspection mode.
--inspect-db-state <path>
Optional override for the SurrealDB state.json used during inspection; defaults to the state recorded for the selected dataset slice.
--db-endpoint <url> SurrealDB server endpoint (use http:// or https:// to enable SurQL export/import; ws:// endpoints reuse existing namespaces but skip SurQL exports; default: ws://127.0.0.1:8000).
--db-user <value> SurrealDB root username (default: root_user).
--db-pass <value> SurrealDB root password (default: root_password).
--db-namespace <ns> Override the namespace used on the SurrealDB server; state.json tracks this value and the ledger case count so changing it or requesting more cases via --limit triggers a rebuild/import (default: derived from dataset).
--db-database <db> Override the database used on the SurrealDB server; recorded alongside namespace in state.json (default: derived from slice).
--perf-log Print per-stage performance timings to stdout after the run.
--perf-log-json <path>
Write structured performance telemetry JSON to the provided path.
--perf-log-dir <path>
Directory that receives timestamped perf JSON copies (defaults to $EVAL_PERF_LOG_DIR).
Examples:
cargo eval -- --dataset squad --limit 10 --detailed-report
cargo eval -- --dataset natural-questions --limit 1 --rerank-pool 1 --detailed-report
Notes:
The latest run's JSON/Markdown reports are saved as eval/reports/latest.json and latest.md, making it easy to script automated checks.
-h, --help Show this help text.
Dataset defaults (from eval/manifest.yaml):
squad raw: eval/data/raw/squad/dev-v2.0.json
converted: eval/data/converted/squad-minne.json
natural-questions raw: eval/data/raw/nq/dev-all.jsonl
converted: eval/data/converted/nq-dev-minne.json
"
    );
}
/// Ensures the parent directory of `path` exists, creating any missing
/// ancestor directories.
///
/// A path with no parent component is a no-op.
///
/// # Errors
/// Propagates the `create_dir_all` failure, annotated with the target path.
pub fn ensure_parent(path: &Path) -> Result<()> {
    match path.parent() {
        Some(parent) => std::fs::create_dir_all(parent)
            .with_context(|| format!("creating parent directory for {}", path.display())),
        None => Ok(()),
    }
}