use std::{ collections::HashMap, fs, path::{Path, PathBuf}, }; use anyhow::{anyhow, Context, Result}; use common::storage::{db::SurrealDbClient, types::text_chunk::TextChunk}; use crate::{args::Config, corpus, eval::connect_eval_db, snapshot::DbSnapshotState}; pub async fn inspect_question(config: &Config) -> Result<()> { let question_id = config .inspect_question .as_ref() .ok_or_else(|| anyhow!("--inspect-question is required for inspection mode"))?; let manifest_path = config .inspect_manifest .as_ref() .ok_or_else(|| anyhow!("--inspect-manifest must be provided for inspection mode"))?; let manifest = load_manifest(manifest_path)?; let chunk_lookup = build_chunk_lookup(&manifest); let question = manifest .questions .iter() .find(|q| q.question_id == *question_id) .ok_or_else(|| { anyhow!( "question '{}' not found in manifest {}", question_id, manifest_path.display() ) })?; println!("Question: {}", question.question_text); println!("Answers: {:?}", question.answers); println!( "matching_chunk_ids ({}):", question.matching_chunk_ids.len() ); let mut missing_in_manifest = Vec::new(); for chunk_id in &question.matching_chunk_ids { if let Some(entry) = chunk_lookup.get(chunk_id) { println!( " - {} (paragraph: {})\n snippet: {}", chunk_id, entry.paragraph_title, entry.snippet ); } else { println!(" - {} (missing from manifest)", chunk_id); missing_in_manifest.push(chunk_id.clone()); } } if missing_in_manifest.is_empty() { println!("All matching_chunk_ids are present in the ingestion manifest"); } else { println!( "Missing chunk IDs in manifest {}: {:?}", manifest_path.display(), missing_in_manifest ); } let db_state_path = config .database .inspect_db_state .clone() .unwrap_or_else(|| default_state_path(config, &manifest)); if let Some(state) = load_db_state(&db_state_path)? { if let (Some(ns), Some(db_name)) = (state.namespace.as_deref(), state.database.as_deref()) { match connect_eval_db(config, ns, db_name).await { Ok(db) => match verify_chunks_in_db(&db, &question.matching_chunk_ids).await? { MissingChunks::None => println!( "All matching_chunk_ids exist in namespace '{}', database '{}'", ns, db_name ), MissingChunks::Missing(list) => println!( "Missing chunks in namespace '{}', database '{}': {:?}", ns, db_name, list ), }, Err(err) => { println!( "Failed to connect to SurrealDB namespace '{}' / database '{}': {err}", ns, db_name ); } } } else { println!( "State file {} is missing namespace/database fields; skipping live DB validation", db_state_path.display() ); } } else { println!( "State file {} not found; skipping live DB validation", db_state_path.display() ); } Ok(()) } struct ChunkEntry { paragraph_title: String, snippet: String, } fn load_manifest(path: &Path) -> Result { let bytes = fs::read(path).with_context(|| format!("reading ingestion manifest {}", path.display()))?; serde_json::from_slice(&bytes) .with_context(|| format!("parsing ingestion manifest {}", path.display())) } fn build_chunk_lookup(manifest: &corpus::CorpusManifest) -> HashMap { let mut lookup = HashMap::new(); for paragraph in &manifest.paragraphs { for chunk in ¶graph.chunks { let snippet = chunk .chunk .chunk .chars() .take(160) .collect::() .replace('\n', " "); lookup.insert( chunk.chunk.id.clone(), ChunkEntry { paragraph_title: paragraph.title.clone(), snippet, }, ); } } lookup } fn default_state_path(config: &Config, manifest: &corpus::CorpusManifest) -> PathBuf { config .cache_dir .join("snapshots") .join(&manifest.metadata.dataset_id) .join(&manifest.metadata.slice_id) .join("db/state.json") } fn load_db_state(path: &Path) -> Result> { if !path.exists() { return Ok(None); } let bytes = fs::read(path).with_context(|| format!("reading db state {}", path.display()))?; let state = serde_json::from_slice(&bytes) .with_context(|| format!("parsing db state {}", path.display()))?; Ok(Some(state)) } enum MissingChunks { None, Missing(Vec), } async fn verify_chunks_in_db(db: &SurrealDbClient, chunk_ids: &[String]) -> Result { let mut missing = Vec::new(); for chunk_id in chunk_ids { let exists = db .get_item::(chunk_id) .await .with_context(|| format!("fetching text_chunk {}", chunk_id))? .is_some(); if !exists { missing.push(chunk_id.clone()); } } if missing.is_empty() { Ok(MissingChunks::None) } else { Ok(MissingChunks::Missing(missing)) } }