Files
minne/evaluations/src/corpus/store.rs
T
Per Stark 0bba1f5a24 release: 1.0.5
fix

fix
2026-06-26 12:31:03 +02:00

687 lines
22 KiB
Rust

use std::{
collections::{HashMap, HashSet},
fs,
io::BufReader,
path::PathBuf,
};
use anyhow::{Context, Result, anyhow};
use chrono::{DateTime, Utc};
use common::storage::{
db::SurrealDbClient,
types::{
StoredObject, knowledge_entity::KnowledgeEntity,
knowledge_relationship::KnowledgeRelationship, text_chunk::TextChunk,
text_content::TextContent,
},
};
use ingestion_pipeline::{IngestionTuning, PipelineArtifacts, persist_artifacts};
use serde::Deserialize;
use tracing::{debug, warn};
use crate::datasets::{ConvertedParagraph, ConvertedQuestion};
/// Current manifest version. `current_manifest_version` returns it, which makes
/// it the serde default for `CorpusManifest::version` when the field is absent.
pub const MANIFEST_VERSION: u32 = 3;
/// Current shard version. It is stamped onto shards by `ParagraphShard::new`
/// and `ParagraphShardStore::persist`, compared against in
/// `ParagraphShardStore::load`, and used as the serde default for
/// `ParagraphShard::version`.
pub const PARAGRAPH_SHARD_VERSION: u32 = 3;
/// Serde `default` hook for `CorpusManifest::version`; returns `MANIFEST_VERSION`.
fn current_manifest_version() -> u32 {
MANIFEST_VERSION
}
/// Serde `default` hook for `ParagraphShard::version`; returns
/// `PARAGRAPH_SHARD_VERSION`.
fn current_paragraph_shard_version() -> u32 {
PARAGRAPH_SHARD_VERSION
}
/// Serde `default` hook for the `chunk_min_tokens` fields: the value assumed
/// when stored JSON does not carry the field.
fn default_chunk_min_tokens() -> usize {
    const FALLBACK_MIN_TOKENS: usize = 500;
    FALLBACK_MIN_TOKENS
}
/// Serde `default` hook for the `chunk_max_tokens` fields: the value assumed
/// when stored JSON does not carry the field.
fn default_chunk_max_tokens() -> usize {
    const FALLBACK_MAX_TOKENS: usize = 2_000;
    FALLBACK_MAX_TOKENS
}
/// Serde `default` hook for the `chunk_only` fields: the value assumed when
/// stored JSON does not carry the field.
fn default_chunk_only() -> bool {
    const FALLBACK_CHUNK_ONLY: bool = true;
    FALLBACK_CHUNK_ONLY
}
// Reuse the pipeline's canonical embedded-artifact types so the on-disk corpus
// format and the ingestion output never drift apart.
pub use ingestion_pipeline::{EmbeddedKnowledgeEntity, EmbeddedTextChunk};
/// Fallback entity shape accepted by `deserialize_embedded_entities`: the
/// `KnowledgeEntity` fields sit directly in the JSON object next to an
/// optional `embedding` array.
#[derive(Debug, Clone, serde::Deserialize)]
struct LegacyKnowledgeEntity {
/// Read from the keys of the enclosing object (serde `flatten`).
#[serde(flatten)]
pub entity: KnowledgeEntity,
/// Empty when the `embedding` key is missing.
#[serde(default)]
pub embedding: Vec<f32>,
}
/// Fallback chunk shape accepted by `deserialize_embedded_chunks`: the
/// `TextChunk` fields sit directly in the JSON object next to an optional
/// `embedding` array.
#[derive(Debug, Clone, serde::Deserialize)]
struct LegacyTextChunk {
/// Read from the keys of the enclosing object (serde `flatten`).
#[serde(flatten)]
pub chunk: TextChunk,
/// Empty when the `embedding` key is missing.
#[serde(default)]
pub embedding: Vec<f32>,
}
fn deserialize_embedded_entities<'de, D>(
deserializer: D,
) -> Result<Vec<EmbeddedKnowledgeEntity>, D::Error>
where
D: serde::Deserializer<'de>,
{
#[derive(serde::Deserialize)]
#[serde(untagged)]
enum EntityInput {
Embedded(Vec<EmbeddedKnowledgeEntity>),
Legacy(Vec<LegacyKnowledgeEntity>),
}
match EntityInput::deserialize(deserializer)? {
EntityInput::Embedded(items) => Ok(items),
EntityInput::Legacy(items) => Ok(items
.into_iter()
.map(|legacy| EmbeddedKnowledgeEntity {
entity: legacy.entity,
embedding: legacy.embedding,
})
.collect()),
}
}
fn deserialize_embedded_chunks<'de, D>(deserializer: D) -> Result<Vec<EmbeddedTextChunk>, D::Error>
where
D: serde::Deserializer<'de>,
{
#[derive(serde::Deserialize)]
#[serde(untagged)]
enum ChunkInput {
Embedded(Vec<EmbeddedTextChunk>),
Legacy(Vec<LegacyTextChunk>),
}
match ChunkInput::deserialize(deserializer)? {
ChunkInput::Embedded(items) => Ok(items),
ChunkInput::Legacy(items) => Ok(items
.into_iter()
.map(|legacy| EmbeddedTextChunk {
chunk: legacy.chunk,
embedding: legacy.embedding,
})
.collect()),
}
}
/// A corpus manifest: a version tag, descriptive metadata, and the paragraph
/// and question records that make up the corpus.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CorpusManifest {
/// Defaults to `MANIFEST_VERSION` (via `current_manifest_version`) when the
/// field is missing from the input.
#[serde(default = "current_manifest_version")]
pub version: u32,
pub metadata: CorpusMetadata,
pub paragraphs: Vec<CorpusParagraph>,
pub questions: Vec<CorpusQuestion>,
}
/// Value stored in `CorpusMetadata::namespace_seed`.
///
/// NOTE(review): presumably records which namespace/database a manifest was
/// seeded into, how many slice cases that covered, and when. The code that
/// writes it is not in this file, so confirm against the caller.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct NamespaceSeedRecord {
pub namespace: String,
pub database: String,
pub slice_case_count: usize,
pub seeded_at: DateTime<Utc>,
}
/// Descriptive metadata carried by a `CorpusManifest`.
///
/// Several fields have serde defaults, so a stored manifest that lacks them
/// still deserialises; those defaults are noted on the fields below.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CorpusMetadata {
pub dataset_id: String,
pub dataset_label: String,
pub slice_id: String,
pub include_unanswerable: bool,
/// `false` when missing from the input.
#[serde(default)]
pub require_verified_chunks: bool,
pub ingestion_fingerprint: String,
pub embedding_backend: String,
pub embedding_model: Option<String>,
/// Passed to `persist_artifacts` as the embedding dimension by
/// `seed_manifest_into_db`.
pub embedding_dimension: usize,
pub converted_checksum: String,
pub generated_at: DateTime<Utc>,
/// Rewritten by `window_manifest` to the length of the narrowed paragraph list.
pub paragraph_count: usize,
/// Rewritten by `window_manifest` to the length of the narrowed question list.
pub question_count: usize,
/// `500` when missing from the input.
#[serde(default = "default_chunk_min_tokens")]
pub chunk_min_tokens: usize,
/// `2_000` when missing from the input.
#[serde(default = "default_chunk_max_tokens")]
pub chunk_max_tokens: usize,
/// `true` when missing from the input.
#[serde(default = "default_chunk_only")]
pub chunk_only: bool,
/// `None` when missing from the input; omitted from the output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub namespace_seed: Option<NamespaceSeedRecord>,
}
/// One paragraph of the corpus together with the artifacts stored for it:
/// its text content, embedded entities, relationships and embedded chunks.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CorpusParagraph {
/// Matched against `CorpusQuestion::paragraph_id` by `window_manifest`.
pub paragraph_id: String,
pub title: String,
pub text_content: TextContent,
/// Read through `deserialize_embedded_entities`, which also accepts the
/// flattened legacy shape.
#[serde(deserialize_with = "deserialize_embedded_entities")]
pub entities: Vec<EmbeddedKnowledgeEntity>,
pub relationships: Vec<KnowledgeRelationship>,
/// Read through `deserialize_embedded_chunks`, which also accepts the
/// flattened legacy shape.
#[serde(deserialize_with = "deserialize_embedded_chunks")]
pub chunks: Vec<EmbeddedTextChunk>,
}
/// One question of the corpus and the paragraph it refers to.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CorpusQuestion {
pub question_id: String,
/// Used by `window_manifest` to decide which paragraphs a question window
/// must keep.
pub paragraph_id: String,
pub text_content_id: String,
pub question_text: String,
pub answers: Vec<String>,
pub is_impossible: bool,
/// NOTE(review): presumably the chunk ids produced by
/// `ParagraphShard::ensure_question_binding`; the code that fills this
/// field is outside this file, so confirm against the caller.
pub matching_chunk_ids: Vec<String>,
}
/// A manifest paired with a filesystem path and reuse/ingestion bookkeeping.
///
/// NOTE(review): nothing in this file constructs or reads this type. The flag
/// and counter fields presumably report how much of the corpus was reused
/// versus freshly ingested, so confirm against the code that builds it.
pub struct CorpusHandle {
pub manifest: CorpusManifest,
pub path: PathBuf,
pub reused_ingestion: bool,
pub reused_embeddings: bool,
pub positive_reused: usize,
pub positive_ingested: usize,
pub negative_reused: usize,
pub negative_ingested: usize,
}
/// Returns a copy of `manifest` narrowed to a window of its questions.
///
/// The window is `questions[offset..offset + length]`, clamped to the end of
/// the question list. A paragraph is kept when a selected question references
/// it. Other paragraphs are kept, in manifest order, until
/// `ceil(selected_paragraph_count * negative_multiplier)` of them have been
/// taken; that quota is capped at the paragraph count minus the number of
/// distinct paragraph ids referenced by any question in the full manifest.
/// `metadata.paragraph_count` and `metadata.question_count` are rewritten to
/// match the narrowed lists; everything else is cloned unchanged.
///
/// # Errors
///
/// Fails when the manifest has no questions, or when `offset` is not smaller
/// than the number of questions.
// The casts implement the float-based quota, the slice bounds are validated
// just above the indexing, and the only remaining `+=` is a bounded counter.
#[allow(
    clippy::arithmetic_side_effects,
    clippy::cast_possible_truncation,
    clippy::cast_precision_loss,
    clippy::cast_sign_loss,
    clippy::indexing_slicing
)]
pub fn window_manifest(
    manifest: &CorpusManifest,
    offset: usize,
    length: usize,
    negative_multiplier: f32,
) -> Result<CorpusManifest> {
    let total = manifest.questions.len();
    if total == 0 {
        return Err(anyhow!(
            "manifest contains no questions; cannot select a window"
        ));
    }
    if offset >= total {
        return Err(anyhow!(
            "window offset {offset} exceeds manifest questions ({total})"
        ));
    }

    // Saturate instead of overflowing: a huge `length` (e.g. `usize::MAX` used
    // as "everything from offset") must clamp to `total`, not panic or wrap.
    let end = offset.saturating_add(length).min(total);
    let questions = manifest.questions[offset..end].to_vec();

    let selected_positive_ids: HashSet<_> =
        questions.iter().map(|q| q.paragraph_id.clone()).collect();
    let positives_all: HashSet<_> = manifest
        .questions
        .iter()
        .map(|q| q.paragraph_id.as_str())
        .collect();
    let available_negatives = manifest
        .paragraphs
        .len()
        .saturating_sub(positives_all.len());
    // Float-to-int `as` saturates, so a negative or NaN multiplier yields 0.
    let desired_negatives =
        ((selected_positive_ids.len() as f32) * negative_multiplier).ceil() as usize;
    let desired_negatives = desired_negatives.min(available_negatives);

    // NOTE(review): the quota above counts paragraphs that no question
    // references, but the loop below admits any paragraph outside the current
    // window, including positives of questions that fall outside it. Confirm
    // that is intended before tightening either side.
    let mut paragraphs = Vec::new();
    let mut negative_count = 0usize;
    for paragraph in &manifest.paragraphs {
        if selected_positive_ids.contains(&paragraph.paragraph_id) {
            paragraphs.push(paragraph.clone());
        } else if negative_count < desired_negatives {
            paragraphs.push(paragraph.clone());
            negative_count += 1;
        }
    }

    let mut narrowed = manifest.clone();
    narrowed.questions = questions;
    narrowed.paragraphs = paragraphs;
    narrowed.metadata.paragraph_count = narrowed.paragraphs.len();
    narrowed.metadata.question_count = narrowed.questions.len();
    Ok(narrowed)
}
/// The JSON document that `ParagraphShardStore` reads and writes for one
/// paragraph: its artifacts plus the settings recorded alongside them.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ParagraphShard {
/// Defaults to `PARAGRAPH_SHARD_VERSION` when missing from the input;
/// `ParagraphShardStore::persist` always writes the current version.
#[serde(default = "current_paragraph_shard_version")]
pub version: u32,
pub paragraph_id: String,
/// Path relative to the store's base directory. `ParagraphShardStore::load`
/// overwrites it with the path the shard was loaded from, and `persist`
/// resolves it to decide where to write.
pub shard_path: String,
/// Compared against the caller's fingerprint by `ParagraphShardStore::load`;
/// a mismatch makes `load` return `None`.
pub ingestion_fingerprint: String,
pub ingested_at: DateTime<Utc>,
pub title: String,
pub text_content: TextContent,
/// Read through `deserialize_embedded_entities`, which also accepts the
/// flattened legacy shape.
#[serde(deserialize_with = "deserialize_embedded_entities")]
pub entities: Vec<EmbeddedKnowledgeEntity>,
pub relationships: Vec<KnowledgeRelationship>,
/// Read through `deserialize_embedded_chunks`, which also accepts the
/// flattened legacy shape.
#[serde(deserialize_with = "deserialize_embedded_chunks")]
pub chunks: Vec<EmbeddedTextChunk>,
/// Question id -> ids of the chunks matched for that question, filled in by
/// `ensure_question_binding`. Empty when missing from the input.
#[serde(default)]
pub question_bindings: HashMap<String, Vec<String>>,
/// Empty string when missing from the input.
#[serde(default)]
pub embedding_backend: String,
/// `None` when missing from the input.
#[serde(default)]
pub embedding_model: Option<String>,
/// `0` when missing from the input.
#[serde(default)]
pub embedding_dimension: usize,
/// `500` when missing from the input.
#[serde(default = "default_chunk_min_tokens")]
pub chunk_min_tokens: usize,
/// `2_000` when missing from the input.
#[serde(default = "default_chunk_max_tokens")]
pub chunk_max_tokens: usize,
/// `true` when missing from the input.
#[serde(default = "default_chunk_only")]
pub chunk_only: bool,
}
/// Loads and persists `ParagraphShard` JSON files beneath a base directory.
pub struct ParagraphShardStore {
/// Directory that relative shard paths are joined onto.
base_dir: PathBuf,
}
impl ParagraphShardStore {
pub fn new(base_dir: PathBuf) -> Self {
Self { base_dir }
}
pub fn ensure_base_dir(&self) -> Result<()> {
fs::create_dir_all(&self.base_dir)
.with_context(|| format!("creating shard base dir {}", self.base_dir.display()))
}
fn resolve(&self, relative: &str) -> PathBuf {
self.base_dir.join(relative)
}
pub fn load(&self, relative: &str, fingerprint: &str) -> Result<Option<ParagraphShard>> {
let path = self.resolve(relative);
let file = match fs::File::open(&path) {
Ok(file) => file,
Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(None),
Err(err) => {
return Err(err).with_context(|| format!("opening shard {}", path.display()));
}
};
let reader = BufReader::new(file);
let mut shard: ParagraphShard = serde_json::from_reader(reader)
.with_context(|| format!("parsing shard {}", path.display()))?;
if shard.ingestion_fingerprint != fingerprint {
debug!(
path = %path.display(),
expected = fingerprint,
found = shard.ingestion_fingerprint,
"Shard fingerprint mismatch; will rebuild"
);
return Ok(None);
}
if shard.version != PARAGRAPH_SHARD_VERSION {
warn!(
path = %path.display(),
version = shard.version,
expected = PARAGRAPH_SHARD_VERSION,
"Upgrading shard to current version"
);
shard.version = PARAGRAPH_SHARD_VERSION;
}
shard.shard_path = relative.to_string();
Ok(Some(shard))
}
pub fn persist(&self, shard: &ParagraphShard) -> Result<()> {
let mut shard = shard.clone();
shard.version = PARAGRAPH_SHARD_VERSION;
let path = self.resolve(&shard.shard_path);
if let Some(parent) = path.parent() {
fs::create_dir_all(parent)
.with_context(|| format!("creating shard dir {}", parent.display()))?;
}
let tmp_path = path.with_extension("json.tmp");
let body = serde_json::to_vec_pretty(&shard).context("serialising paragraph shard")?;
fs::write(&tmp_path, &body)
.with_context(|| format!("writing shard tmp {}", tmp_path.display()))?;
fs::rename(&tmp_path, &path)
.with_context(|| format!("renaming shard tmp {}", path.display()))?;
Ok(())
}
}
impl ParagraphShard {
    /// Builds a shard for `paragraph` from freshly produced artifacts.
    ///
    /// The id and title are copied from `paragraph`, the version is set to
    /// `PARAGRAPH_SHARD_VERSION`, `ingested_at` is the current UTC time, and
    /// the question bindings start out empty.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        paragraph: &ConvertedParagraph,
        shard_path: String,
        ingestion_fingerprint: &str,
        text_content: TextContent,
        entities: Vec<EmbeddedKnowledgeEntity>,
        relationships: Vec<KnowledgeRelationship>,
        chunks: Vec<EmbeddedTextChunk>,
        embedding_backend: &str,
        embedding_model: Option<String>,
        embedding_dimension: usize,
        chunk_min_tokens: usize,
        chunk_max_tokens: usize,
        chunk_only: bool,
    ) -> Self {
        let paragraph_id = paragraph.id.clone();
        let title = paragraph.title.clone();
        let ingestion_fingerprint = ingestion_fingerprint.to_owned();
        let embedding_backend = embedding_backend.to_owned();

        Self {
            version: PARAGRAPH_SHARD_VERSION,
            paragraph_id,
            shard_path,
            ingestion_fingerprint,
            ingested_at: Utc::now(),
            title,
            text_content,
            entities,
            relationships,
            chunks,
            question_bindings: HashMap::new(),
            embedding_backend,
            embedding_model,
            embedding_dimension,
            chunk_min_tokens,
            chunk_max_tokens,
            chunk_only,
        }
    }

    /// Clones the paragraph-level fields into a `CorpusParagraph`.
    pub fn to_corpus_paragraph(&self) -> CorpusParagraph {
        let Self {
            paragraph_id,
            title,
            text_content,
            entities,
            relationships,
            chunks,
            ..
        } = self;

        CorpusParagraph {
            paragraph_id: paragraph_id.clone(),
            title: title.clone(),
            text_content: text_content.clone(),
            entities: entities.clone(),
            relationships: relationships.clone(),
            chunks: chunks.clone(),
        }
    }

    /// Returns the chunk ids bound to `question`, computing them on first use.
    ///
    /// The boolean is `false` when the binding was already stored and `true`
    /// when it was computed by `validate_answers` and recorded just now.
    ///
    /// # Errors
    ///
    /// Propagates the `validate_answers` error; nothing is recorded then.
    pub fn ensure_question_binding(
        &mut self,
        question: &ConvertedQuestion,
    ) -> Result<(Vec<String>, bool)> {
        if let Some(cached) = self.question_bindings.get(&question.id).cloned() {
            return Ok((cached, false));
        }

        let fresh = validate_answers(&self.text_content, &self.chunks, question)?;
        self.question_bindings
            .insert(question.id.clone(), fresh.clone());
        Ok((fresh, true))
    }
}
/// Finds the chunks that contain any of the question's expected answers.
///
/// Matching is ASCII-case-insensitive and is tried in two forms per answer:
/// on the lowercased raw text, and on the `normalize_answer_text` form (the
/// latter is skipped when the normalised answer is empty). Both the full
/// `content.text` and every chunk are checked.
///
/// Returns the sorted, de-duplicated ids of the matching chunks. The list is
/// empty when the question is impossible or has no answers. It can also be
/// empty when an answer occurs in `content.text` but in no chunk.
///
/// # Errors
///
/// Fails when no answer is found in `content.text` or in any chunk.
fn validate_answers(
    content: &TextContent,
    chunks: &[EmbeddedTextChunk],
    question: &ConvertedQuestion,
) -> Result<Vec<String>> {
    if question.is_impossible || question.answers.is_empty() {
        return Ok(Vec::new());
    }

    let haystack = content.text.to_ascii_lowercase();
    let haystack_norm = normalize_answer_text(&haystack);

    // Lowercase and normalise every chunk once up front. Doing it inside the
    // answer loop repeated the same work once per answer.
    let prepared_chunks: Vec<(String, String, &EmbeddedTextChunk)> = chunks
        .iter()
        .map(|chunk| {
            let chunk_text = chunk.chunk.chunk.to_ascii_lowercase();
            let chunk_norm = normalize_answer_text(&chunk_text);
            (chunk_text, chunk_norm, chunk)
        })
        .collect();

    // A BTreeSet keeps the returned ids sorted and free of duplicates.
    let mut matches = std::collections::BTreeSet::new();
    let mut found_any = false;

    for answer in &question.answers {
        let needle: String = answer.to_ascii_lowercase();
        let needle_norm = normalize_answer_text(&needle);

        let text_match = haystack.contains(&needle)
            || (!needle_norm.is_empty() && haystack_norm.contains(&needle_norm));
        if text_match {
            found_any = true;
        }

        for (chunk_text, chunk_norm, chunk) in &prepared_chunks {
            if chunk_text.contains(&needle)
                || (!needle_norm.is_empty() && chunk_norm.contains(&needle_norm))
            {
                matches.insert(chunk.chunk.id().to_string());
                found_any = true;
            }
        }
    }

    if found_any {
        Ok(matches.into_iter().collect())
    } else {
        Err(anyhow!(
            "expected answer for question '{}' was not found in ingested content",
            question.id
        ))
    }
}
/// Reduces `text` to its alphanumeric words, ASCII-lowercased and joined by
/// single spaces.
///
/// Every character that is not alphanumeric (punctuation, symbols and any
/// whitespace) acts as a word separator, and runs of separators collapse.
/// Non-ASCII letters are kept as they are; only ASCII letters are lowercased.
fn normalize_answer_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    let words = text
        .split(|ch: char| !ch.is_alphanumeric())
        .filter(|word| !word.is_empty());
    for word in words {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.extend(word.chars().map(|ch| ch.to_ascii_lowercase()));
    }
    normalized
}
pub async fn seed_manifest_into_db(db: &SurrealDbClient, manifest: &CorpusManifest) -> Result<()> {
let tuning = IngestionTuning::default();
let embedding_dimensions = manifest.metadata.embedding_dimension;
let mut seen_text_content = HashSet::new();
let result = async {
for paragraph in &manifest.paragraphs {
if !seen_text_content.insert(paragraph.text_content.id.clone()) {
continue;
}
let artifacts = PipelineArtifacts {
text_content: paragraph.text_content.clone(),
entities: paragraph.entities.clone(),
relationships: paragraph.relationships.clone(),
chunks: paragraph.chunks.clone(),
};
persist_artifacts(db, &tuning, embedding_dimensions, artifacts)
.await
.map_err(|err| anyhow!("persist manifest paragraph: {err}"))?;
}
Ok(())
}
.await;
if result.is_err() {
// Best-effort cleanup to avoid leaving partial manifest data behind.
let _ = db
.client
.query(
"BEGIN TRANSACTION;
DELETE text_chunk_embedding;
DELETE knowledge_entity_embedding;
DELETE relates_to;
DELETE text_chunk;
DELETE knowledge_entity;
DELETE text_content;
COMMIT TRANSACTION;",
)
.await;
}
result
}
#[cfg(test)]
mod tests {
use super::*;
use chrono::Utc;
use common::storage::types::{
knowledge_entity::{KnowledgeEntity, KnowledgeEntityType},
text_chunk::TextChunk,
};
use uuid::Uuid;
/// Builds a two-paragraph, one-question manifest fixture.
///
/// Both paragraphs share the same text content, entity and chunk; only the
/// first carries a relationship, and the single question points at it.
#[allow(clippy::too_many_lines)]
fn build_manifest() -> CorpusManifest {
    let timestamp = Utc::now();
    let owner = String::from("user-1");
    let origin = String::from("source-1");

    let content_id = Uuid::new_v4().to_string();
    let shared_content = TextContent {
        id: content_id.clone(),
        created_at: timestamp,
        updated_at: timestamp,
        text: String::from("Hello world"),
        file_info: None,
        url_info: None,
        context: None,
        category: String::from("test"),
        user_id: owner.clone(),
    };

    let shared_entity = EmbeddedKnowledgeEntity {
        entity: KnowledgeEntity {
            id: Uuid::new_v4().to_string(),
            created_at: timestamp,
            updated_at: timestamp,
            source_id: origin.clone(),
            name: String::from("Entity"),
            description: String::from("A test entity"),
            entity_type: KnowledgeEntityType::Document,
            metadata: None,
            user_id: owner.clone(),
        },
        embedding: vec![0.1, 0.2, 0.3],
    };

    // The relationship links the entity to itself.
    let entity_ref = format!("knowledge_entity:{}", shared_entity.entity.id);
    let self_link = KnowledgeRelationship::new(
        entity_ref.clone(),
        entity_ref,
        owner.clone(),
        origin.clone(),
        String::from("related"),
    );

    let shared_chunk = EmbeddedTextChunk {
        chunk: TextChunk {
            id: Uuid::new_v4().to_string(),
            created_at: timestamp,
            updated_at: timestamp,
            source_id: origin,
            chunk: String::from("chunk text"),
            user_id: owner,
        },
        embedding: vec![0.3, 0.2, 0.1],
    };
    let chunk_id = shared_chunk.chunk.id.clone();

    let first = CorpusParagraph {
        paragraph_id: String::from("p1"),
        title: String::from("Paragraph 1"),
        text_content: shared_content.clone(),
        entities: vec![shared_entity.clone()],
        relationships: vec![self_link],
        chunks: vec![shared_chunk.clone()],
    };
    // Duplicate content/entities should be de-duplicated by the loader.
    let second = CorpusParagraph {
        paragraph_id: String::from("p2"),
        title: String::from("Paragraph 2"),
        text_content: shared_content,
        entities: vec![shared_entity],
        relationships: Vec::new(),
        chunks: vec![shared_chunk],
    };

    let only_question = CorpusQuestion {
        question_id: String::from("q1"),
        paragraph_id: first.paragraph_id.clone(),
        text_content_id: content_id,
        question_text: String::from("What is this?"),
        answers: vec![String::from("Hello")],
        is_impossible: false,
        matching_chunk_ids: vec![chunk_id],
    };

    let metadata = CorpusMetadata {
        dataset_id: String::from("dataset"),
        dataset_label: String::from("Dataset"),
        slice_id: String::from("slice"),
        include_unanswerable: false,
        require_verified_chunks: false,
        ingestion_fingerprint: String::from("fp"),
        embedding_backend: String::from("test"),
        embedding_model: Some(String::from("model")),
        embedding_dimension: 3,
        converted_checksum: String::from("checksum"),
        generated_at: timestamp,
        paragraph_count: 2,
        question_count: 1,
        chunk_min_tokens: 1,
        chunk_max_tokens: 10,
        chunk_only: false,
        namespace_seed: None,
    };

    CorpusManifest {
        version: current_manifest_version(),
        metadata,
        paragraphs: vec![first, second],
        questions: vec![only_question],
    }
}
/// Windowing one question with a 4x negative multiplier keeps that question's
/// paragraph plus exactly four other paragraphs, and rewrites the metadata
/// counts to match.
#[allow(clippy::indexing_slicing, clippy::expect_used)]
#[test]
fn window_manifest_trims_questions_and_negatives() {
    let mut manifest = build_manifest();

    // Add eight artifact-free paragraphs so more negatives are available than
    // the multiplier asks for (fixture: 1 positive, then 9 other paragraphs).
    let template = manifest.paragraphs[0].clone();
    manifest.paragraphs.extend((0..8).map(|_| {
        let mut extra = template.clone();
        extra.paragraph_id = Uuid::new_v4().to_string();
        extra.entities.clear();
        extra.relationships.clear();
        extra.chunks.clear();
        extra
    }));
    manifest.metadata.paragraph_count = manifest.paragraphs.len();

    let windowed = window_manifest(&manifest, 0, 1, 4.0).expect("window manifest");
    assert_eq!(windowed.questions.len(), 1);
    assert!(
        windowed.paragraphs.len() <= manifest.paragraphs.len(),
        "windowed paragraphs should never exceed original"
    );

    let positive_set: std::collections::HashSet<_> = windowed
        .questions
        .iter()
        .map(|q| q.paragraph_id.as_str())
        .collect();
    let positives = windowed
        .paragraphs
        .iter()
        .filter(|p| positive_set.contains(p.paragraph_id.as_str()))
        .count();
    let negatives = windowed.paragraphs.len().saturating_sub(positives);

    assert_eq!(positives, 1);
    // ceil(1 selected paragraph * 4.0) = 4, and 9 candidates are available.
    assert_eq!(negatives, 4, "should include exactly multiplier-many negatives");
    assert_eq!(
        windowed.metadata.paragraph_count,
        windowed.paragraphs.len(),
        "metadata paragraph count should track the narrowed list"
    );
    assert_eq!(
        windowed.metadata.question_count, 1,
        "metadata question count should track the narrowed list"
    );
}
}