benchmarks: fin

This commit is contained in:
Per Stark
2025-12-08 21:57:53 +01:00
parent 0cb1abc6db
commit a8d10f265c
39 changed files with 774 additions and 714 deletions
+184
View File
@@ -0,0 +1,184 @@
use std::{
collections::HashMap,
fs,
path::{Path, PathBuf},
};
use anyhow::{anyhow, Context, Result};
use common::storage::{db::SurrealDbClient, types::text_chunk::TextChunk};
use crate::{args::Config, eval::connect_eval_db, corpus, snapshot::DbSnapshotState};
/// Inspection-mode entry point: reports where one question's expected chunks can be found.
///
/// Looks up the question named by `config.inspect_question` in the ingestion manifest at
/// `config.inspect_manifest`, prints its text, answers and every entry of
/// `matching_chunk_ids` (with paragraph title and snippet when the manifest contains the
/// chunk), and then, if a DB snapshot state file is available, checks the same chunk IDs
/// against the database described by that state file.
///
/// # Errors
///
/// Fails when either required option is unset, the manifest cannot be read or parsed, the
/// question is not present in the manifest, the state file cannot be loaded, or a chunk
/// lookup against the database fails. A failed database *connection* is only reported on
/// stdout and does not fail the call.
pub async fn inspect_question(config: &Config) -> Result<()> {
    let wanted_id = config
        .inspect_question
        .as_ref()
        .ok_or_else(|| anyhow!("--inspect-question is required for inspection mode"))?;
    let manifest_path = config
        .inspect_manifest
        .as_ref()
        .ok_or_else(|| anyhow!("--inspect-manifest must be provided for inspection mode"))?;

    let manifest = load_manifest(manifest_path)?;
    let known_chunks = build_chunk_lookup(&manifest);

    let located = manifest
        .questions
        .iter()
        .find(|candidate| candidate.question_id == *wanted_id);
    let question = match located {
        Some(found) => found,
        None => {
            return Err(anyhow!(
                "question '{}' not found in manifest {}",
                wanted_id,
                manifest_path.display()
            ));
        }
    };

    // --- Manifest-level report -------------------------------------------------------------
    println!("Question: {}", question.question_text);
    println!("Answers: {:?}", question.answers);
    println!(
        "matching_chunk_ids ({}):",
        question.matching_chunk_ids.len()
    );

    let mut absent_from_manifest = Vec::new();
    for chunk_id in &question.matching_chunk_ids {
        match known_chunks.get(chunk_id) {
            Some(entry) => println!(
                " - {} (paragraph: {})\n snippet: {}",
                chunk_id, entry.paragraph_title, entry.snippet
            ),
            None => {
                println!(" - {} (missing from manifest)", chunk_id);
                absent_from_manifest.push(chunk_id.clone());
            }
        }
    }

    if !absent_from_manifest.is_empty() {
        println!(
            "Missing chunk IDs in manifest {}: {:?}",
            manifest_path.display(),
            absent_from_manifest
        );
    } else {
        println!("All matching_chunk_ids are present in the ingestion manifest");
    }

    // --- Live database validation ----------------------------------------------------------
    // An explicitly configured state path wins; otherwise fall back to the cache layout.
    let db_state_path = match config.database.inspect_db_state.clone() {
        Some(explicit) => explicit,
        None => default_state_path(config, &manifest),
    };

    let state = match load_db_state(&db_state_path)? {
        Some(loaded) => loaded,
        None => {
            println!(
                "State file {} not found; skipping live DB validation",
                db_state_path.display()
            );
            return Ok(());
        }
    };

    let (ns, db_name) = match (state.namespace.as_deref(), state.database.as_deref()) {
        (Some(ns), Some(db_name)) => (ns, db_name),
        _ => {
            println!(
                "State file {} is missing namespace/database fields; skipping live DB validation",
                db_state_path.display()
            );
            return Ok(());
        }
    };

    match connect_eval_db(config, ns, db_name).await {
        // Connection problems are reported, not propagated.
        Err(err) => println!(
            "Failed to connect to SurrealDB namespace '{}' / database '{}': {err}",
            ns, db_name
        ),
        Ok(db) => match verify_chunks_in_db(&db, &question.matching_chunk_ids).await? {
            MissingChunks::Missing(list) => println!(
                "Missing chunks in namespace '{}', database '{}': {:?}",
                ns, db_name, list
            ),
            MissingChunks::None => println!(
                "All matching_chunk_ids exist in namespace '{}', database '{}'",
                ns, db_name
            ),
        },
    }

    Ok(())
}
/// Display information about one manifest chunk, keyed by chunk ID in the lookup map.
#[derive(Debug, Clone)]
struct ChunkEntry {
    /// Title of the paragraph the chunk belongs to.
    paragraph_title: String,
    /// Short single-line preview of the chunk text (at most 160 characters, newlines
    /// replaced by spaces, as produced by `build_chunk_lookup`).
    snippet: String,
}
fn load_manifest(path: &Path) -> Result<corpus::CorpusManifest> {
let bytes =
fs::read(path).with_context(|| format!("reading ingestion manifest {}", path.display()))?;
serde_json::from_slice(&bytes)
.with_context(|| format!("parsing ingestion manifest {}", path.display()))
}
/// Builds a map from chunk ID to display information for every chunk in the manifest.
///
/// Each entry carries the title of the owning paragraph and a preview made from the first
/// 160 characters of the chunk text with newlines turned into spaces. If the same chunk ID
/// occurs more than once, the last occurrence wins.
fn build_chunk_lookup(manifest: &corpus::CorpusManifest) -> HashMap<String, ChunkEntry> {
    manifest
        .paragraphs
        .iter()
        .flat_map(|paragraph| paragraph.chunks.iter().map(move |item| (paragraph, item)))
        .map(|(paragraph, item)| {
            // Truncate by characters (not bytes) and flatten to a single line.
            let snippet: String = item
                .chunk
                .chunk
                .chars()
                .take(160)
                .map(|ch| if ch == '\n' { ' ' } else { ch })
                .collect();
            let entry = ChunkEntry {
                paragraph_title: paragraph.title.clone(),
                snippet,
            };
            (item.chunk.id.clone(), entry)
        })
        .collect()
}
/// Computes the default location of the DB snapshot state file for a manifest:
/// `<cache_dir>/snapshots/<dataset_id>/<slice_id>/db/state.json`.
fn default_state_path(config: &Config, manifest: &corpus::CorpusManifest) -> PathBuf {
    let mut state_path = config.cache_dir.join("snapshots");
    state_path.push(&manifest.metadata.dataset_id);
    state_path.push(&manifest.metadata.slice_id);
    state_path.push("db/state.json");
    state_path
}
fn load_db_state(path: &Path) -> Result<Option<DbSnapshotState>> {
if !path.exists() {
return Ok(None);
}
let bytes = fs::read(path).with_context(|| format!("reading db state {}", path.display()))?;
let state = serde_json::from_slice(&bytes)
.with_context(|| format!("parsing db state {}", path.display()))?;
Ok(Some(state))
}
/// Outcome of checking a set of chunk IDs against the database.
#[derive(Debug, PartialEq, Eq)]
enum MissingChunks {
    /// Every requested chunk ID was found.
    None,
    /// The listed chunk IDs were not found, in the order they were checked.
    Missing(Vec<String>),
}
/// Checks, one ID at a time, whether each of `chunk_ids` can be fetched from `db` as a
/// `TextChunk`.
///
/// Returns [`MissingChunks::None`] when every ID is found, otherwise
/// [`MissingChunks::Missing`] with the IDs that were not found, in input order.
///
/// # Errors
///
/// Stops at and returns the first lookup error, annotated with the chunk ID being fetched.
async fn verify_chunks_in_db(db: &SurrealDbClient, chunk_ids: &[String]) -> Result<MissingChunks> {
    let mut absent = Vec::new();
    for id in chunk_ids {
        let fetched = db
            .get_item::<TextChunk>(id)
            .await
            .with_context(|| format!("fetching text_chunk {}", id))?;
        match fetched {
            Some(_) => {}
            None => absent.push(id.clone()),
        }
    }

    Ok(if absent.is_empty() {
        MissingChunks::None
    } else {
        MissingChunks::Missing(absent)
    })
}