mirror of
https://github.com/perstarkse/minne.git
synced 2026-06-21 14:09:33 +02:00
chore: ingestion-pipeline refactor, sort technical debt, rustfmt
This commit is contained in:
@@ -9,8 +9,7 @@ pub use orchestrator::{
|
||||
};
|
||||
pub use store::{
|
||||
seed_manifest_into_db, window_manifest, CorpusHandle, CorpusManifest, CorpusMetadata,
|
||||
CorpusQuestion, EmbeddedKnowledgeEntity, EmbeddedTextChunk, ParagraphShard,
|
||||
ParagraphShardStore, MANIFEST_VERSION,
|
||||
CorpusQuestion, ParagraphShard, ParagraphShardStore, MANIFEST_VERSION,
|
||||
};
|
||||
|
||||
pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline::IngestionConfig {
|
||||
|
||||
@@ -33,8 +33,7 @@ use crate::{
|
||||
|
||||
use crate::corpus::{
|
||||
CorpusCacheConfig, CorpusHandle, CorpusManifest, CorpusMetadata, CorpusQuestion,
|
||||
EmbeddedKnowledgeEntity, EmbeddedTextChunk, ParagraphShard, ParagraphShardStore,
|
||||
MANIFEST_VERSION,
|
||||
ParagraphShard, ParagraphShardStore, MANIFEST_VERSION,
|
||||
};
|
||||
|
||||
const INGESTION_SPEC_VERSION: u32 = 2;
|
||||
@@ -273,10 +272,19 @@ pub async fn ensure_corpus(
|
||||
.context("shard record missing after ingestion run")?;
|
||||
if cache.refresh_embeddings_only || shard_record.needs_reembed {
|
||||
// Embeddings are now generated by the pipeline using FastEmbed - no need to re-embed
|
||||
shard_record.shard.ingestion_fingerprint.clone_from(&ingestion_fingerprint);
|
||||
shard_record
|
||||
.shard
|
||||
.ingestion_fingerprint
|
||||
.clone_from(&ingestion_fingerprint);
|
||||
shard_record.shard.ingested_at = Utc::now();
|
||||
shard_record.shard.embedding_backend.clone_from(&embedding_backend_label);
|
||||
shard_record.shard.embedding_model.clone_from(&embedding_model_code);
|
||||
shard_record
|
||||
.shard
|
||||
.embedding_backend
|
||||
.clone_from(&embedding_backend_label);
|
||||
shard_record
|
||||
.shard
|
||||
.embedding_model
|
||||
.clone_from(&embedding_model_code);
|
||||
shard_record.shard.embedding_dimension = embedding_dimension;
|
||||
shard_record.dirty = true;
|
||||
shard_record.needs_reembed = false;
|
||||
@@ -543,31 +551,16 @@ async fn ingest_single_paragraph(
|
||||
let task = IngestionTask::new(payload, user_id.to_string());
|
||||
match pipeline.produce_artifacts(&task).await {
|
||||
Ok(artifacts) => {
|
||||
let entities: Vec<EmbeddedKnowledgeEntity> = artifacts
|
||||
.entities
|
||||
.into_iter()
|
||||
.map(|e| EmbeddedKnowledgeEntity {
|
||||
entity: e.entity,
|
||||
embedding: e.embedding,
|
||||
})
|
||||
.collect();
|
||||
let chunks: Vec<EmbeddedTextChunk> = artifacts
|
||||
.chunks
|
||||
.into_iter()
|
||||
.map(|c| EmbeddedTextChunk {
|
||||
chunk: c.chunk,
|
||||
embedding: c.embedding,
|
||||
})
|
||||
.collect();
|
||||
// No need to reembed - pipeline now uses FastEmbed internally
|
||||
// Artifacts already carry the shared `Embedded*` types and FastEmbed
|
||||
// embeddings, so they can be persisted to the shard without re-mapping.
|
||||
let mut shard = ParagraphShard::new(
|
||||
paragraph,
|
||||
request.shard_path,
|
||||
ingestion_fingerprint,
|
||||
artifacts.text_content,
|
||||
entities,
|
||||
artifacts.entities,
|
||||
artifacts.relationships,
|
||||
chunks,
|
||||
artifacts.chunks,
|
||||
&embedding_backend,
|
||||
embedding_model.clone(),
|
||||
embedding_dimension,
|
||||
|
||||
@@ -54,17 +54,9 @@ fn default_chunk_only() -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
pub struct EmbeddedKnowledgeEntity {
|
||||
pub entity: KnowledgeEntity,
|
||||
pub embedding: Vec<f32>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
pub struct EmbeddedTextChunk {
|
||||
pub chunk: TextChunk,
|
||||
pub embedding: Vec<f32>,
|
||||
}
|
||||
// Reuse the pipeline's canonical embedded-artifact types so the on-disk corpus
|
||||
// format and the ingestion output never drift apart.
|
||||
pub use ingestion_pipeline::{EmbeddedKnowledgeEntity, EmbeddedTextChunk};
|
||||
|
||||
#[derive(Debug, Clone, serde::Deserialize)]
|
||||
struct LegacyKnowledgeEntity {
|
||||
|
||||
@@ -11,7 +11,11 @@ use tracing::warn;
|
||||
|
||||
use super::{ConvertedParagraph, ConvertedQuestion};
|
||||
|
||||
#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects, clippy::cast_sign_loss)]
|
||||
#[allow(
|
||||
clippy::too_many_lines,
|
||||
clippy::arithmetic_side_effects,
|
||||
clippy::cast_sign_loss
|
||||
)]
|
||||
pub fn convert_nq(
|
||||
raw_path: &Path,
|
||||
include_unanswerable: bool,
|
||||
|
||||
@@ -166,8 +166,7 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
);
|
||||
|
||||
if parsed.config.slice_grow.is_some() {
|
||||
eval::grow_slice(&dataset, &parsed.config)
|
||||
.context("growing slice ledger")?;
|
||||
eval::grow_slice(&dataset, &parsed.config).context("growing slice ledger")?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ use common::{
|
||||
utils::embedding::EmbeddingProvider,
|
||||
};
|
||||
use retrieval_pipeline::{
|
||||
pipeline::{StageTimings, RetrievalConfig},
|
||||
pipeline::{RetrievalConfig, StageTimings},
|
||||
reranking::RerankerPool,
|
||||
};
|
||||
|
||||
@@ -122,11 +122,15 @@ impl<'a> EvaluationContext<'a> {
|
||||
}
|
||||
|
||||
pub fn slice(&self) -> Result<&slice::ResolvedSlice<'a>> {
|
||||
self.slice.as_ref().ok_or_else(|| anyhow!("slice has not been prepared"))
|
||||
self.slice
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("slice has not been prepared"))
|
||||
}
|
||||
|
||||
pub fn db(&self) -> Result<&SurrealDbClient> {
|
||||
self.db.as_ref().ok_or_else(|| anyhow!("database connection missing"))
|
||||
self.db
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("database connection missing"))
|
||||
}
|
||||
|
||||
pub fn descriptor(&self) -> Result<&snapshot::Descriptor> {
|
||||
@@ -142,15 +146,23 @@ impl<'a> EvaluationContext<'a> {
|
||||
}
|
||||
|
||||
pub fn openai_client(&self) -> Result<Arc<Client<async_openai::config::OpenAIConfig>>> {
|
||||
Ok(Arc::clone(self.openai_client.as_ref().ok_or_else(|| anyhow!("openai client missing"))?))
|
||||
Ok(Arc::clone(
|
||||
self.openai_client
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("openai client missing"))?,
|
||||
))
|
||||
}
|
||||
|
||||
pub fn corpus_handle(&self) -> Result<&corpus::CorpusHandle> {
|
||||
self.corpus_handle.as_ref().ok_or_else(|| anyhow!("corpus handle missing"))
|
||||
self.corpus_handle
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("corpus handle missing"))
|
||||
}
|
||||
|
||||
pub fn evaluation_user(&self) -> Result<&User> {
|
||||
self.eval_user.as_ref().ok_or_else(|| anyhow!("evaluation user missing"))
|
||||
self.eval_user
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("evaluation user missing"))
|
||||
}
|
||||
|
||||
#[allow(clippy::arithmetic_side_effects)]
|
||||
@@ -168,7 +180,8 @@ impl<'a> EvaluationContext<'a> {
|
||||
}
|
||||
|
||||
pub fn into_summary(self) -> Result<EvaluationSummary> {
|
||||
self.summary.ok_or_else(|| anyhow!("evaluation summary missing"))
|
||||
self.summary
|
||||
.ok_or_else(|| anyhow!("evaluation summary missing"))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ use crate::eval::{
|
||||
CaseSummary, RetrievedSummary,
|
||||
};
|
||||
use retrieval_pipeline::{
|
||||
pipeline::{self, StageTimings, RetrievalConfig},
|
||||
pipeline::{self, RetrievalConfig, StageTimings},
|
||||
reranking::RerankerPool,
|
||||
};
|
||||
use tokio::sync::Semaphore;
|
||||
@@ -169,10 +169,10 @@ pub(crate) async fn run_queries(
|
||||
let query_start = Instant::now();
|
||||
|
||||
debug!(question_id = %question_id, "Evaluating query");
|
||||
let query_embedding =
|
||||
embedding_provider.embed(&question).await.with_context(|| {
|
||||
format!("generating embedding for question {question_id}")
|
||||
})?;
|
||||
let query_embedding = embedding_provider
|
||||
.embed(&question)
|
||||
.await
|
||||
.with_context(|| format!("generating embedding for question {question_id}"))?;
|
||||
let reranker = match rerank_pool.as_ref() {
|
||||
Some(pool) => pool.checkout().await,
|
||||
None => None,
|
||||
@@ -204,8 +204,10 @@ pub(crate) async fn run_queries(
|
||||
let mut match_rank = None;
|
||||
let answers_lower: Vec<String> =
|
||||
answers.iter().map(|ans| ans.to_ascii_lowercase()).collect();
|
||||
let expected_chunk_ids_set: HashSet<&str> =
|
||||
expected_chunk_ids.iter().map(std::string::String::as_str).collect();
|
||||
let expected_chunk_ids_set: HashSet<&str> = expected_chunk_ids
|
||||
.iter()
|
||||
.map(std::string::String::as_str)
|
||||
.collect();
|
||||
let chunk_id_required = has_verified_chunks;
|
||||
let mut entity_hit = false;
|
||||
let mut chunk_text_hit = false;
|
||||
@@ -304,15 +306,12 @@ pub(crate) async fn run_queries(
|
||||
None
|
||||
};
|
||||
|
||||
Ok::<
|
||||
(
|
||||
usize,
|
||||
CaseSummary,
|
||||
Option<CaseDiagnostics>,
|
||||
StageTimings,
|
||||
),
|
||||
anyhow::Error,
|
||||
>((idx, summary, diagnostics, stage_timings))
|
||||
Ok::<(usize, CaseSummary, Option<CaseDiagnostics>, StageTimings), anyhow::Error>((
|
||||
idx,
|
||||
summary,
|
||||
diagnostics,
|
||||
stage_timings,
|
||||
))
|
||||
}
|
||||
})
|
||||
.buffer_unordered(concurrency)
|
||||
|
||||
@@ -13,7 +13,11 @@ use super::super::{
|
||||
};
|
||||
use super::{map_guard_error, StageResult};
|
||||
|
||||
#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects, clippy::cast_precision_loss)]
|
||||
#[allow(
|
||||
clippy::too_many_lines,
|
||||
clippy::arithmetic_side_effects,
|
||||
clippy::cast_precision_loss
|
||||
)]
|
||||
pub(crate) async fn summarize(
|
||||
machine: EvaluationMachine<(), QueriesFinished>,
|
||||
ctx: &mut EvaluationContext<'_>,
|
||||
|
||||
+147
-26
@@ -403,11 +403,20 @@ pub fn write_reports(
|
||||
})
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_lines, clippy::write_with_newline, clippy::unwrap_used)]
|
||||
#[allow(
|
||||
clippy::too_many_lines,
|
||||
clippy::write_with_newline,
|
||||
clippy::unwrap_used
|
||||
)]
|
||||
fn render_markdown(report: &EvaluationReport) -> String {
|
||||
let mut md = String::new();
|
||||
|
||||
write!(md, "# Retrieval Evaluation (k={})\\n\\n", report.retrieval.k).unwrap();
|
||||
write!(
|
||||
md,
|
||||
"# Retrieval Evaluation (k={})\\n\\n",
|
||||
report.retrieval.k
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
md.push_str("## Overview\\n\\n");
|
||||
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
|
||||
@@ -424,34 +433,94 @@ fn render_markdown(report: &EvaluationReport) -> String {
|
||||
)
|
||||
.unwrap();
|
||||
write!(md, "| Total Cases | {} |\\n", report.overview.total_cases).unwrap();
|
||||
write!(md, "| Filtered Questions | {} |\\n", report.overview.filtered_questions).unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Filtered Questions | {} |\\n",
|
||||
report.overview.filtered_questions
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
md.push_str("\\n## Dataset & Slice\\n\\n");
|
||||
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
|
||||
write!(md, "| Dataset | {} (`{}`) |\\n", report.dataset.label, report.dataset.id).unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Dataset | {} (`{}`) |\\n",
|
||||
report.dataset.label, report.dataset.id
|
||||
)
|
||||
.unwrap();
|
||||
write!(md, "| Dataset Source | {} |\\n", report.dataset.source).unwrap();
|
||||
write!(md, "| Includes Unanswerable | {} |\\n", bool_badge(report.dataset.includes_unanswerable)).unwrap();
|
||||
write!(md, "| Require Verified Chunks | {} |\\n", bool_badge(report.dataset.require_verified_chunks)).unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Includes Unanswerable | {} |\\n",
|
||||
bool_badge(report.dataset.includes_unanswerable)
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Require Verified Chunks | {} |\\n",
|
||||
bool_badge(report.dataset.require_verified_chunks)
|
||||
)
|
||||
.unwrap();
|
||||
let embedding_label = if let Some(model) = report.dataset.embedding_model.as_ref() {
|
||||
format!("{} ({model})", report.dataset.embedding_backend)
|
||||
} else {
|
||||
report.dataset.embedding_backend.clone()
|
||||
};
|
||||
write!(md, "| Embedding | {embedding_label} |\\n").unwrap();
|
||||
write!(md, "| Embedding Dim | {} |\\n", report.dataset.embedding_dimension).unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Embedding Dim | {} |\\n",
|
||||
report.dataset.embedding_dimension
|
||||
)
|
||||
.unwrap();
|
||||
write!(md, "| Slice ID | `{}` |\\n", report.slice.id).unwrap();
|
||||
write!(md, "| Slice Seed | {} |\\n", report.slice.seed).unwrap();
|
||||
write!(md, "| Slice Window (offset/length) | {}/{} |\\n", report.slice.window_offset, report.slice.window_length).unwrap();
|
||||
write!(md, "| Slice Questions (window/ledger) | {}/{} |\\n", report.slice.slice_cases, report.slice.ledger_total_cases).unwrap();
|
||||
write!(md, "| Slice Positives / Negatives | {}/{} |\\n", report.slice.positives, report.slice.negatives).unwrap();
|
||||
write!(md, "| Slice Paragraphs | {} |\\n", report.slice.total_paragraphs).unwrap();
|
||||
write!(md, "| Negative Multiplier | {:.2} |\\n", report.slice.negative_multiplier).unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Slice Window (offset/length) | {}/{} |\\n",
|
||||
report.slice.window_offset, report.slice.window_length
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Slice Questions (window/ledger) | {}/{} |\\n",
|
||||
report.slice.slice_cases, report.slice.ledger_total_cases
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Slice Positives / Negatives | {}/{} |\\n",
|
||||
report.slice.positives, report.slice.negatives
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Slice Paragraphs | {} |\\n",
|
||||
report.slice.total_paragraphs
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Negative Multiplier | {:.2} |\\n",
|
||||
report.slice.negative_multiplier
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
md.push_str("\\n## Retrieval Metrics\\n\\n");
|
||||
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
|
||||
write!(md, "| Cases | {} |\\n", report.retrieval.cases).unwrap();
|
||||
write!(md, "| Correct@{} | {}/{} |\\n", report.retrieval.k, report.retrieval.correct, report.retrieval.cases).unwrap();
|
||||
write!(md, "| Precision@{} | {:.3} |\\n", report.retrieval.k, report.retrieval.precision).unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Correct@{} | {}/{} |\\n",
|
||||
report.retrieval.k, report.retrieval.correct, report.retrieval.cases
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Precision@{} | {:.3} |\\n",
|
||||
report.retrieval.k, report.retrieval.precision
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Precision@1/2/3 | {:.3} / {:.3} / {:.3} |\\n",
|
||||
@@ -462,7 +531,12 @@ fn render_markdown(report: &EvaluationReport) -> String {
|
||||
.unwrap();
|
||||
write!(md, "| MRR | {:.3} |\\n", report.retrieval.mrr).unwrap();
|
||||
write!(md, "| NDCG | {:.3} |\\n", report.retrieval.average_ndcg).unwrap();
|
||||
write!(md, "| Latency Avg / P50 / P95 (ms) | {:.1} / {} / {} |\\n", report.retrieval.latency.avg, report.retrieval.latency.p50, report.retrieval.latency.p95).unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Latency Avg / P50 / P95 (ms) | {:.1} / {} / {} |\\n",
|
||||
report.retrieval.latency.avg, report.retrieval.latency.p50, report.retrieval.latency.p95
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Resolve entities | {} |\\n",
|
||||
@@ -473,8 +547,14 @@ fn render_markdown(report: &EvaluationReport) -> String {
|
||||
if report.retrieval.rerank_enabled {
|
||||
let pool = report
|
||||
.retrieval
|
||||
.rerank_pool_size.map_or_else(|| "?".into(), |size| size.to_string());
|
||||
write!(md, "| Rerank | enabled (pool {pool}, keep top {}) |\\n", report.retrieval.rerank_keep_top).unwrap();
|
||||
.rerank_pool_size
|
||||
.map_or_else(|| "?".into(), |size| size.to_string());
|
||||
write!(
|
||||
md,
|
||||
"| Rerank | enabled (pool {pool}, keep top {}) |\\n",
|
||||
report.retrieval.rerank_keep_top
|
||||
)
|
||||
.unwrap();
|
||||
} else {
|
||||
md.push_str("| Rerank | disabled |\\n");
|
||||
}
|
||||
@@ -489,8 +569,18 @@ fn render_markdown(report: &EvaluationReport) -> String {
|
||||
|
||||
md.push_str("\\n## Performance\\n\\n");
|
||||
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
|
||||
write!(md, "| OpenAI Base URL | {} |\\n", report.performance.openai_base_url).unwrap();
|
||||
write!(md, "| Ingestion Duration | {} ms |\\n", report.performance.ingestion_ms).unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| OpenAI Base URL | {} |\\n",
|
||||
report.performance.openai_base_url
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Ingestion Duration | {} ms |\\n",
|
||||
report.performance.ingestion_ms
|
||||
)
|
||||
.unwrap();
|
||||
if let Some(seed) = report.performance.namespace_seed_ms {
|
||||
write!(md, "| Namespace Seed | {seed} ms |\\n").unwrap();
|
||||
}
|
||||
@@ -504,14 +594,44 @@ fn render_markdown(report: &EvaluationReport) -> String {
|
||||
}
|
||||
)
|
||||
.unwrap();
|
||||
write!(md, "| Corpus Paragraphs | {} |\\n", report.performance.corpus_paragraphs).unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Corpus Paragraphs | {} |\\n",
|
||||
report.performance.corpus_paragraphs
|
||||
)
|
||||
.unwrap();
|
||||
if report.detailed_report {
|
||||
write!(md, "| Ingestion Cache | `{}` |\\n", report.performance.ingestion_cache_path).unwrap();
|
||||
write!(md, "| Ingestion Reused | {} |\\n", bool_badge(report.performance.ingestion_reused)).unwrap();
|
||||
write!(md, "| Embeddings Reused | {} |\\n", bool_badge(report.performance.embeddings_reused)).unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Ingestion Cache | `{}` |\\n",
|
||||
report.performance.ingestion_cache_path
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Ingestion Reused | {} |\\n",
|
||||
bool_badge(report.performance.ingestion_reused)
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Embeddings Reused | {} |\\n",
|
||||
bool_badge(report.performance.embeddings_reused)
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
write!(md, "| Positives Cached | {} |\\n", report.performance.positive_paragraphs_reused).unwrap();
|
||||
write!(md, "| Negatives Cached | {} |\\n", report.performance.negative_paragraphs_reused).unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Positives Cached | {} |\\n",
|
||||
report.performance.positive_paragraphs_reused
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Negatives Cached | {} |\\n",
|
||||
report.performance.negative_paragraphs_reused
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
md.push_str("\\n## Retrieval Stage Timings\\n\\n");
|
||||
md.push_str("| Stage | Avg (ms) | P50 (ms) | P95 (ms) |\\n| --- | --- | --- | --- |\\n");
|
||||
@@ -583,7 +703,8 @@ fn render_markdown(report: &EvaluationReport) -> String {
|
||||
for case in &report.llm_cases {
|
||||
let retrieved = render_retrieved(&case.retrieved);
|
||||
let rank = case
|
||||
.match_rank.map_or_else(|| "-".into(), |rank| rank.to_string());
|
||||
.match_rank
|
||||
.map_or_else(|| "-".into(), |rank| rank.to_string());
|
||||
write!(
|
||||
md,
|
||||
"| `{}` | {} | {} | {} |\\n",
|
||||
|
||||
+31
-19
@@ -99,10 +99,13 @@ fn sanitize_identifier(input: &str) -> String {
|
||||
let mut hasher = Sha256::new();
|
||||
hasher.update(input.as_bytes());
|
||||
let digest = hasher.finalize();
|
||||
digest.iter().take(6).fold(String::with_capacity(12), |mut s, b| {
|
||||
let _ = write!(s, "{b:02x}");
|
||||
s
|
||||
})
|
||||
digest
|
||||
.iter()
|
||||
.take(6)
|
||||
.fold(String::with_capacity(12), |mut s, b| {
|
||||
let _ = write!(s, "{b:02x}");
|
||||
s
|
||||
})
|
||||
} else {
|
||||
trimmed
|
||||
}
|
||||
@@ -127,7 +130,9 @@ pub struct SliceWindow<'a> {
|
||||
|
||||
impl SliceWindow<'_> {
|
||||
pub fn positive_ids(&self) -> impl Iterator<Item = &str> {
|
||||
self.positive_paragraph_ids.iter().map(std::string::String::as_str)
|
||||
self.positive_paragraph_ids
|
||||
.iter()
|
||||
.map(std::string::String::as_str)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -169,7 +174,10 @@ impl DatasetIndex {
|
||||
.paragraph_by_id
|
||||
.get(id)
|
||||
.ok_or_else(|| anyhow!("slice references unknown paragraph '{id}'"))?;
|
||||
dataset.paragraphs.get(*idx).ok_or_else(|| anyhow!("paragraph index out of bounds"))
|
||||
dataset
|
||||
.paragraphs
|
||||
.get(*idx)
|
||||
.ok_or_else(|| anyhow!("paragraph index out of bounds"))
|
||||
}
|
||||
|
||||
fn question<'a>(
|
||||
@@ -181,7 +189,9 @@ impl DatasetIndex {
|
||||
.question_by_id
|
||||
.get(question_id)
|
||||
.ok_or_else(|| anyhow!("slice references unknown question '{question_id}'"))?;
|
||||
let paragraph = dataset.paragraphs.get(*p_idx)
|
||||
let paragraph = dataset
|
||||
.paragraphs
|
||||
.get(*p_idx)
|
||||
.ok_or_else(|| anyhow!("paragraph index out of bounds for question '{question_id}'"))?;
|
||||
let question = paragraph
|
||||
.questions
|
||||
@@ -318,9 +328,7 @@ pub fn resolve_slice<'a>(
|
||||
.is_some_and(|manifest| manifest.version != SLICE_VERSION)
|
||||
{
|
||||
warn!(
|
||||
slice = manifest
|
||||
.as_ref()
|
||||
.map_or("unknown", |m| m.slice_id.as_str()),
|
||||
slice = manifest.as_ref().map_or("unknown", |m| m.slice_id.as_str()),
|
||||
found = manifest.as_ref().map_or(0, |m| m.version),
|
||||
expected = SLICE_VERSION,
|
||||
"Slice manifest version mismatch; regenerating"
|
||||
@@ -919,7 +927,11 @@ fn ensure_shard_paths(manifest: &mut SliceManifest) -> bool {
|
||||
changed
|
||||
}
|
||||
|
||||
#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation, clippy::cast_sign_loss)]
|
||||
#[allow(
|
||||
clippy::cast_precision_loss,
|
||||
clippy::cast_possible_truncation,
|
||||
clippy::cast_sign_loss
|
||||
)]
|
||||
fn desired_negative_target(
|
||||
positive_count: usize,
|
||||
requested_corpus: usize,
|
||||
@@ -1007,10 +1019,13 @@ fn compute_slice_id(key: &SliceKey<'_>) -> Result<String> {
|
||||
let mut hasher = Sha256::new();
|
||||
hasher.update(payload);
|
||||
let digest = hasher.finalize();
|
||||
Ok(digest.iter().take(16).fold(String::with_capacity(32), |mut s, b| {
|
||||
let _ = write!(s, "{b:02x}");
|
||||
s
|
||||
}))
|
||||
Ok(digest
|
||||
.iter()
|
||||
.take(16)
|
||||
.fold(String::with_capacity(32), |mut s, b| {
|
||||
let _ = write!(s, "{b:02x}");
|
||||
s
|
||||
}))
|
||||
}
|
||||
|
||||
#[allow(clippy::indexing_slicing)]
|
||||
@@ -1050,10 +1065,7 @@ impl<'a> From<&'a Config> for SliceConfig<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn slice_config_with_limit(
|
||||
config: &Config,
|
||||
limit_override: Option<usize>,
|
||||
) -> SliceConfig<'_> {
|
||||
pub fn slice_config_with_limit(config: &Config, limit_override: Option<usize>) -> SliceConfig<'_> {
|
||||
SliceConfig {
|
||||
cache_dir: config.cache_dir.as_path(),
|
||||
force_convert: config.force_convert,
|
||||
|
||||
@@ -409,7 +409,10 @@ pub fn build_case_diagnostics(
|
||||
candidates: &[EvaluationCandidate],
|
||||
pipeline_stats: Option<Diagnostics>,
|
||||
) -> CaseDiagnostics {
|
||||
let expected_set: HashSet<&str> = expected_chunk_ids.iter().map(std::string::String::as_str).collect();
|
||||
let expected_set: HashSet<&str> = expected_chunk_ids
|
||||
.iter()
|
||||
.map(std::string::String::as_str)
|
||||
.collect();
|
||||
let mut seen_chunks: HashSet<String> = HashSet::new();
|
||||
let mut attached_chunk_ids = Vec::new();
|
||||
let mut entity_diagnostics = Vec::new();
|
||||
|
||||
Reference in New Issue
Block a user