use std::{ collections::HashMap, fs, path::Path, }; use anyhow::{anyhow, Context, Result}; use common::storage::{db::SurrealDbClient, types::text_chunk::TextChunk}; use crate::{args::Config, corpus, db::connect_eval_db}; pub async fn inspect_question(config: &Config) -> Result<()> { let question_id = config .inspect_question .as_ref() .ok_or_else(|| anyhow!("--inspect-question is required for inspection mode"))?; let manifest_path = config .inspect_manifest .as_ref() .ok_or_else(|| anyhow!("--inspect-manifest must be provided for inspection mode"))?; let manifest = load_manifest(manifest_path)?; let chunk_lookup = build_chunk_lookup(&manifest); let question = manifest .questions .iter() .find(|q| q.question_id == *question_id) .ok_or_else(|| { anyhow!( "question '{}' not found in manifest {}", question_id, manifest_path.display() ) })?; println!("Question: {}", question.question_text); println!("Answers: {:?}", question.answers); println!( "matching_chunk_ids ({}):", question.matching_chunk_ids.len() ); let mut missing_in_manifest = Vec::new(); for chunk_id in &question.matching_chunk_ids { if let Some(entry) = chunk_lookup.get(chunk_id) { println!( " - {} (paragraph: {})\n snippet: {}", chunk_id, entry.paragraph_title, entry.snippet ); } else { println!(" - {chunk_id} (missing from manifest)"); missing_in_manifest.push(chunk_id.clone()); } } if missing_in_manifest.is_empty() { println!("All matching_chunk_ids are present in the ingestion manifest"); } else { println!( "Missing chunk IDs in manifest {}: {:?}", manifest_path.display(), missing_in_manifest ); } if let Some(seed) = manifest.metadata.namespace_seed.as_ref() { let ns = seed.namespace.as_str(); let db_name = seed.database.as_str(); match connect_eval_db(config, ns, db_name).await { Ok(db) => match verify_chunks_in_db(&db, &question.matching_chunk_ids).await? { MissingChunks::None => println!( "All matching_chunk_ids exist in namespace '{ns}', database '{db_name}'" ), MissingChunks::Missing(list) => println!( "Missing chunks in namespace '{ns}', database '{db_name}': {list:?}" ), }, Err(err) => { println!( "Failed to connect to SurrealDB namespace '{ns}' / database '{db_name}': {err}" ); } } } else { println!("Corpus manifest has no namespace seed; skipping live DB validation"); } Ok(()) } struct ChunkEntry { paragraph_title: String, snippet: String, } fn load_manifest(path: &Path) -> Result { let bytes = fs::read(path).with_context(|| format!("reading ingestion manifest {}", path.display()))?; serde_json::from_slice(&bytes) .with_context(|| format!("parsing ingestion manifest {}", path.display())) } fn build_chunk_lookup(manifest: &corpus::CorpusManifest) -> HashMap { let mut lookup = HashMap::new(); for paragraph in &manifest.paragraphs { for chunk in ¶graph.chunks { let snippet = chunk .chunk .chunk .chars() .take(160) .collect::() .replace('\n', " "); lookup.insert( chunk.chunk.id.clone(), ChunkEntry { paragraph_title: paragraph.title.clone(), snippet, }, ); } } lookup } enum MissingChunks { None, Missing(Vec), } async fn verify_chunks_in_db(db: &SurrealDbClient, chunk_ids: &[String]) -> Result { let mut missing = Vec::new(); for chunk_id in chunk_ids { let exists = db .get_item::(chunk_id) .await .with_context(|| format!("fetching text_chunk {chunk_id}"))? .is_some(); if !exists { missing.push(chunk_id.clone()); } } if missing.is_empty() { Ok(MissingChunks::None) } else { Ok(MissingChunks::Missing(missing)) } }