mirror of
https://github.com/perstarkse/minne.git
synced 2026-03-27 03:41:32 +01:00
beir-rff
This commit is contained in:
@@ -20,7 +20,7 @@ datasets:
|
||||
category: "Natural Questions"
|
||||
entity_suffix: "Natural Questions"
|
||||
source_prefix: "nq"
|
||||
raw: "data/raw/nq/dev-all.jsonl"
|
||||
raw: "data/raw/nq-dev/dev-all.jsonl"
|
||||
converted: "data/converted/nq-dev-minne.json"
|
||||
include_unanswerable: true
|
||||
slices:
|
||||
@@ -42,7 +42,7 @@ datasets:
|
||||
slices:
|
||||
- id: beir-mix-600
|
||||
label: "BEIR mix (600)"
|
||||
description: "Balanced slice across FEVER, FiQA, HotpotQA, NFCorpus, Quora, TREC-COVID"
|
||||
description: "Balanced slice across FEVER, FiQA, HotpotQA, NFCorpus, Quora, TREC-COVID, SciFact, NQ-BEIR"
|
||||
limit: 600
|
||||
corpus_limit: 6000
|
||||
seed: 0x5eed2025
|
||||
@@ -136,3 +136,33 @@ datasets:
|
||||
limit: 200
|
||||
corpus_limit: 5000
|
||||
seed: 0x5eed2025
|
||||
- id: scifact
|
||||
label: "SciFact (BEIR)"
|
||||
category: "SciFact"
|
||||
entity_suffix: "SciFact"
|
||||
source_prefix: "scifact"
|
||||
raw: "data/raw/scifact"
|
||||
converted: "data/converted/scifact-minne.json"
|
||||
include_unanswerable: false
|
||||
slices:
|
||||
- id: scifact-test-200
|
||||
label: "SciFact test (200)"
|
||||
description: "200-case slice from BEIR test qrels"
|
||||
limit: 200
|
||||
corpus_limit: 3000
|
||||
seed: 0x5eed2025
|
||||
- id: nq-beir
|
||||
label: "Natural Questions (BEIR)"
|
||||
category: "Natural Questions"
|
||||
entity_suffix: "Natural Questions"
|
||||
source_prefix: "nq-beir"
|
||||
raw: "data/raw/nq"
|
||||
converted: "data/converted/nq-beir-minne.json"
|
||||
include_unanswerable: false
|
||||
slices:
|
||||
- id: nq-beir-test-200
|
||||
label: "NQ (BEIR) test (200)"
|
||||
description: "200-case slice from BEIR test qrels"
|
||||
limit: 200
|
||||
corpus_limit: 5000
|
||||
seed: 0x5eed2025
|
||||
|
||||
@@ -84,6 +84,26 @@ pub struct RetrievalSettings {
|
||||
#[arg(long, default_value_t = 5)]
|
||||
pub chunk_result_cap: usize,
|
||||
|
||||
/// Reciprocal rank fusion k value for revised chunk merging
|
||||
#[arg(long)]
|
||||
pub chunk_rrf_k: Option<f32>,
|
||||
|
||||
/// Weight for vector ranks in revised RRF
|
||||
#[arg(long)]
|
||||
pub chunk_rrf_vector_weight: Option<f32>,
|
||||
|
||||
/// Weight for chunk FTS ranks in revised RRF
|
||||
#[arg(long)]
|
||||
pub chunk_rrf_fts_weight: Option<f32>,
|
||||
|
||||
/// Include vector ranks in revised RRF (default: true)
|
||||
#[arg(long)]
|
||||
pub chunk_rrf_use_vector: Option<bool>,
|
||||
|
||||
/// Include chunk FTS ranks in revised RRF (default: true)
|
||||
#[arg(long)]
|
||||
pub chunk_rrf_use_fts: Option<bool>,
|
||||
|
||||
/// Require verified chunks (disable with --llm-mode)
|
||||
#[arg(skip = true)]
|
||||
pub require_verified_chunks: bool,
|
||||
@@ -104,6 +124,11 @@ impl Default for RetrievalSettings {
|
||||
rerank_pool_size: 4,
|
||||
rerank_keep_top: 10,
|
||||
chunk_result_cap: 5,
|
||||
chunk_rrf_k: None,
|
||||
chunk_rrf_vector_weight: None,
|
||||
chunk_rrf_fts_weight: None,
|
||||
chunk_rrf_use_vector: None,
|
||||
chunk_rrf_use_fts: None,
|
||||
require_verified_chunks: true,
|
||||
strategy: RetrievalStrategy::Initial,
|
||||
}
|
||||
@@ -376,6 +401,28 @@ impl Config {
|
||||
));
|
||||
}
|
||||
|
||||
if let Some(k) = self.retrieval.chunk_rrf_k {
|
||||
if k <= 0.0 || !k.is_finite() {
|
||||
return Err(anyhow!(
|
||||
"--chunk-rrf-k must be a positive, finite number (got {k})"
|
||||
));
|
||||
}
|
||||
}
|
||||
if let Some(weight) = self.retrieval.chunk_rrf_vector_weight {
|
||||
if weight < 0.0 || !weight.is_finite() {
|
||||
return Err(anyhow!(
|
||||
"--chunk-rrf-vector-weight must be a non-negative, finite number (got {weight})"
|
||||
));
|
||||
}
|
||||
}
|
||||
if let Some(weight) = self.retrieval.chunk_rrf_fts_weight {
|
||||
if weight < 0.0 || !weight.is_finite() {
|
||||
return Err(anyhow!(
|
||||
"--chunk-rrf-fts-weight must be a non-negative, finite number (got {weight})"
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if self.concurrency == 0 {
|
||||
return Err(anyhow!("--concurrency must be greater than zero"));
|
||||
}
|
||||
|
||||
@@ -262,6 +262,10 @@ pub enum DatasetKind {
|
||||
Quora,
|
||||
#[value(name = "trec-covid", alias = "treccovid", alias = "trec_covid")]
|
||||
TrecCovid,
|
||||
#[value(name = "scifact")]
|
||||
Scifact,
|
||||
#[value(name = "nq-beir", alias = "natural-questions-beir")]
|
||||
NqBeir,
|
||||
}
|
||||
|
||||
impl DatasetKind {
|
||||
@@ -276,6 +280,8 @@ impl DatasetKind {
|
||||
Self::Nfcorpus => "nfcorpus",
|
||||
Self::Quora => "quora",
|
||||
Self::TrecCovid => "trec-covid",
|
||||
Self::Scifact => "scifact",
|
||||
Self::NqBeir => "nq-beir",
|
||||
}
|
||||
}
|
||||
|
||||
@@ -290,6 +296,8 @@ impl DatasetKind {
|
||||
Self::Nfcorpus => "NFCorpus (BEIR)",
|
||||
Self::Quora => "Quora (IR)",
|
||||
Self::TrecCovid => "TREC-COVID (BEIR)",
|
||||
Self::Scifact => "SciFact (BEIR)",
|
||||
Self::NqBeir => "Natural Questions (BEIR)",
|
||||
}
|
||||
}
|
||||
|
||||
@@ -304,6 +312,8 @@ impl DatasetKind {
|
||||
Self::Nfcorpus => "NFCorpus",
|
||||
Self::Quora => "Quora",
|
||||
Self::TrecCovid => "TREC-COVID",
|
||||
Self::Scifact => "SciFact",
|
||||
Self::NqBeir => "Natural Questions",
|
||||
}
|
||||
}
|
||||
|
||||
@@ -318,6 +328,8 @@ impl DatasetKind {
|
||||
Self::Nfcorpus => "NFCorpus",
|
||||
Self::Quora => "Quora",
|
||||
Self::TrecCovid => "TREC-COVID",
|
||||
Self::Scifact => "SciFact",
|
||||
Self::NqBeir => "Natural Questions",
|
||||
}
|
||||
}
|
||||
|
||||
@@ -332,6 +344,8 @@ impl DatasetKind {
|
||||
Self::Nfcorpus => "nfcorpus",
|
||||
Self::Quora => "quora",
|
||||
Self::TrecCovid => "trec-covid",
|
||||
Self::Scifact => "scifact",
|
||||
Self::NqBeir => "nq-beir",
|
||||
}
|
||||
}
|
||||
|
||||
@@ -376,20 +390,24 @@ impl FromStr for DatasetKind {
|
||||
"nfcorpus" | "nf-corpus" => Ok(Self::Nfcorpus),
|
||||
"quora" => Ok(Self::Quora),
|
||||
"trec-covid" | "treccovid" | "trec_covid" => Ok(Self::TrecCovid),
|
||||
"scifact" => Ok(Self::Scifact),
|
||||
"nq-beir" | "natural-questions-beir" => Ok(Self::NqBeir),
|
||||
other => {
|
||||
anyhow::bail!("unknown dataset '{other}'. Expected one of: squad, natural-questions, beir, fever, fiqa, hotpotqa, nfcorpus, quora, trec-covid.")
|
||||
anyhow::bail!("unknown dataset '{other}'. Expected one of: squad, natural-questions, beir, fever, fiqa, hotpotqa, nfcorpus, quora, trec-covid, scifact, nq-beir.")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub const BEIR_DATASETS: [DatasetKind; 6] = [
|
||||
pub const BEIR_DATASETS: [DatasetKind; 8] = [
|
||||
DatasetKind::Fever,
|
||||
DatasetKind::Fiqa,
|
||||
DatasetKind::HotpotQa,
|
||||
DatasetKind::Nfcorpus,
|
||||
DatasetKind::Quora,
|
||||
DatasetKind::TrecCovid,
|
||||
DatasetKind::Scifact,
|
||||
DatasetKind::NqBeir,
|
||||
];
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -481,7 +499,9 @@ pub fn convert(
|
||||
| DatasetKind::HotpotQa
|
||||
| DatasetKind::Nfcorpus
|
||||
| DatasetKind::Quora
|
||||
| DatasetKind::TrecCovid => beir::convert_beir(raw_path, dataset)?,
|
||||
| DatasetKind::TrecCovid
|
||||
| DatasetKind::Scifact
|
||||
| DatasetKind::NqBeir => beir::convert_beir(raw_path, dataset)?,
|
||||
};
|
||||
|
||||
let metadata_limit = match dataset {
|
||||
@@ -489,13 +509,26 @@ pub fn convert(
|
||||
_ => context_token_limit,
|
||||
};
|
||||
|
||||
let generated_at = match dataset {
|
||||
DatasetKind::Beir
|
||||
| DatasetKind::Fever
|
||||
| DatasetKind::Fiqa
|
||||
| DatasetKind::HotpotQa
|
||||
| DatasetKind::Nfcorpus
|
||||
| DatasetKind::Quora
|
||||
| DatasetKind::TrecCovid
|
||||
| DatasetKind::Scifact
|
||||
| DatasetKind::NqBeir => base_timestamp(),
|
||||
_ => Utc::now(),
|
||||
};
|
||||
|
||||
let source_label = match dataset {
|
||||
DatasetKind::Beir => "beir-mix".to_string(),
|
||||
_ => raw_path.display().to_string(),
|
||||
};
|
||||
|
||||
Ok(ConvertedDataset {
|
||||
generated_at: Utc::now(),
|
||||
generated_at,
|
||||
metadata: DatasetMetadata::for_kind(dataset, include_unanswerable, metadata_limit),
|
||||
source: source_label,
|
||||
paragraphs,
|
||||
|
||||
@@ -63,6 +63,21 @@ pub(crate) async fn run_queries(
|
||||
if let Some(value) = config.retrieval.chunk_fts_take {
|
||||
retrieval_config.tuning.chunk_fts_take = value;
|
||||
}
|
||||
if let Some(value) = config.retrieval.chunk_rrf_k {
|
||||
retrieval_config.tuning.chunk_rrf_k = value;
|
||||
}
|
||||
if let Some(value) = config.retrieval.chunk_rrf_vector_weight {
|
||||
retrieval_config.tuning.chunk_rrf_vector_weight = value;
|
||||
}
|
||||
if let Some(value) = config.retrieval.chunk_rrf_fts_weight {
|
||||
retrieval_config.tuning.chunk_rrf_fts_weight = value;
|
||||
}
|
||||
if let Some(value) = config.retrieval.chunk_rrf_use_vector {
|
||||
retrieval_config.tuning.chunk_rrf_use_vector = value;
|
||||
}
|
||||
if let Some(value) = config.retrieval.chunk_rrf_use_fts {
|
||||
retrieval_config.tuning.chunk_rrf_use_fts = value;
|
||||
}
|
||||
if let Some(value) = config.retrieval.chunk_avg_chars_per_token {
|
||||
retrieval_config.tuning.avg_chars_per_token = value;
|
||||
}
|
||||
@@ -93,6 +108,11 @@ pub(crate) async fn run_queries(
|
||||
rerank_keep_top = config.retrieval.rerank_keep_top,
|
||||
chunk_vector_take = effective_chunk_vector,
|
||||
chunk_fts_take = effective_chunk_fts,
|
||||
chunk_rrf_k = active_tuning.chunk_rrf_k,
|
||||
chunk_rrf_vector_weight = active_tuning.chunk_rrf_vector_weight,
|
||||
chunk_rrf_fts_weight = active_tuning.chunk_rrf_fts_weight,
|
||||
chunk_rrf_use_vector = active_tuning.chunk_rrf_use_vector,
|
||||
chunk_rrf_use_fts = active_tuning.chunk_rrf_use_fts,
|
||||
embedding_backend = ctx.embedding_provider().backend_label(),
|
||||
embedding_model = ctx
|
||||
.embedding_provider()
|
||||
|
||||
@@ -202,6 +202,11 @@ pub(crate) async fn summarize(
|
||||
detailed_report: config.detailed_report,
|
||||
retrieval_strategy: config.retrieval.strategy.to_string(),
|
||||
chunk_result_cap: config.retrieval.chunk_result_cap,
|
||||
chunk_rrf_k: active_tuning.chunk_rrf_k,
|
||||
chunk_rrf_vector_weight: active_tuning.chunk_rrf_vector_weight,
|
||||
chunk_rrf_fts_weight: active_tuning.chunk_rrf_fts_weight,
|
||||
chunk_rrf_use_vector: active_tuning.chunk_rrf_use_vector,
|
||||
chunk_rrf_use_fts: active_tuning.chunk_rrf_use_fts,
|
||||
ingest_chunk_min_tokens: config.ingest_chunk_min_tokens,
|
||||
ingest_chunk_max_tokens: config.ingest_chunk_max_tokens,
|
||||
ingest_chunks_only: config.ingest_chunks_only,
|
||||
|
||||
@@ -70,6 +70,11 @@ pub struct EvaluationSummary {
|
||||
pub detailed_report: bool,
|
||||
pub retrieval_strategy: String,
|
||||
pub chunk_result_cap: usize,
|
||||
pub chunk_rrf_k: f32,
|
||||
pub chunk_rrf_vector_weight: f32,
|
||||
pub chunk_rrf_fts_weight: f32,
|
||||
pub chunk_rrf_use_vector: bool,
|
||||
pub chunk_rrf_use_fts: bool,
|
||||
pub ingest_chunk_min_tokens: usize,
|
||||
pub ingest_chunk_max_tokens: usize,
|
||||
pub ingest_chunks_only: bool,
|
||||
|
||||
@@ -373,6 +373,20 @@ pub async fn ensure_corpus(
|
||||
let reused_ingestion = ingested_count == 0 && !cache.force_refresh;
|
||||
let reused_embeddings = reused_ingestion && !cache.refresh_embeddings_only;
|
||||
|
||||
info!(
|
||||
dataset = %dataset.metadata.id,
|
||||
slice = %slice.manifest.slice_id,
|
||||
fingerprint = %ingestion_fingerprint,
|
||||
reused_ingestion,
|
||||
reused_embeddings,
|
||||
positive_reused = stats.positive_reused,
|
||||
positive_ingested = stats.positive_ingested,
|
||||
negative_reused = stats.negative_reused,
|
||||
negative_ingested = stats.negative_ingested,
|
||||
shard_dir = %base_dir.display(),
|
||||
"Corpus cache outcome"
|
||||
);
|
||||
|
||||
let handle = CorpusHandle {
|
||||
manifest,
|
||||
path: base_dir,
|
||||
|
||||
@@ -22,7 +22,7 @@ use common::storage::{
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use surrealdb::sql::Thing;
|
||||
use tracing::warn;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
use crate::datasets::{ConvertedParagraph, ConvertedQuestion};
|
||||
|
||||
@@ -440,6 +440,12 @@ impl ParagraphShardStore {
|
||||
.with_context(|| format!("parsing shard {}", path.display()))?;
|
||||
|
||||
if shard.ingestion_fingerprint != fingerprint {
|
||||
debug!(
|
||||
path = %path.display(),
|
||||
expected = fingerprint,
|
||||
found = shard.ingestion_fingerprint,
|
||||
"Shard fingerprint mismatch; will rebuild"
|
||||
);
|
||||
return Ok(None);
|
||||
}
|
||||
if shard.version != PARAGRAPH_SHARD_VERSION {
|
||||
|
||||
@@ -197,6 +197,11 @@ mod tests {
|
||||
detailed_report: false,
|
||||
retrieval_strategy: "initial".into(),
|
||||
chunk_result_cap: 5,
|
||||
chunk_rrf_k: 60.0,
|
||||
chunk_rrf_vector_weight: 1.0,
|
||||
chunk_rrf_fts_weight: 1.0,
|
||||
chunk_rrf_use_vector: true,
|
||||
chunk_rrf_use_fts: true,
|
||||
ingest_chunk_min_tokens: 256,
|
||||
ingest_chunk_max_tokens: 512,
|
||||
ingest_chunks_only: false,
|
||||
|
||||
@@ -88,6 +88,16 @@ pub struct RetrievalSection {
|
||||
pub rerank_pool_size: Option<usize>,
|
||||
pub rerank_keep_top: usize,
|
||||
pub chunk_result_cap: usize,
|
||||
#[serde(default = "default_chunk_rrf_k")]
|
||||
pub chunk_rrf_k: f32,
|
||||
#[serde(default = "default_chunk_rrf_weight")]
|
||||
pub chunk_rrf_vector_weight: f32,
|
||||
#[serde(default = "default_chunk_rrf_weight")]
|
||||
pub chunk_rrf_fts_weight: f32,
|
||||
#[serde(default = "default_chunk_rrf_use")]
|
||||
pub chunk_rrf_use_vector: bool,
|
||||
#[serde(default = "default_chunk_rrf_use")]
|
||||
pub chunk_rrf_use_fts: bool,
|
||||
#[serde(default)]
|
||||
pub chunk_vector_take: usize,
|
||||
#[serde(default)]
|
||||
@@ -98,6 +108,18 @@ pub struct RetrievalSection {
|
||||
pub ingest_chunks_only: bool,
|
||||
}
|
||||
|
||||
const fn default_chunk_rrf_k() -> f32 {
|
||||
60.0
|
||||
}
|
||||
|
||||
const fn default_chunk_rrf_weight() -> f32 {
|
||||
1.0
|
||||
}
|
||||
|
||||
const fn default_chunk_rrf_use() -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LlmSection {
|
||||
pub cases: usize,
|
||||
@@ -206,6 +228,11 @@ impl EvaluationReport {
|
||||
rerank_pool_size: summary.rerank_pool_size,
|
||||
rerank_keep_top: summary.rerank_keep_top,
|
||||
chunk_result_cap: summary.chunk_result_cap,
|
||||
chunk_rrf_k: summary.chunk_rrf_k,
|
||||
chunk_rrf_vector_weight: summary.chunk_rrf_vector_weight,
|
||||
chunk_rrf_fts_weight: summary.chunk_rrf_fts_weight,
|
||||
chunk_rrf_use_vector: summary.chunk_rrf_use_vector,
|
||||
chunk_rrf_use_fts: summary.chunk_rrf_use_fts,
|
||||
chunk_vector_take: summary.chunk_vector_take,
|
||||
chunk_fts_take: summary.chunk_fts_take,
|
||||
ingest_chunk_min_tokens: summary.ingest_chunk_min_tokens,
|
||||
@@ -856,6 +883,11 @@ fn convert_legacy_entry(entry: LegacyHistoryEntry) -> EvaluationReport {
|
||||
rerank_pool_size: entry.rerank_pool_size,
|
||||
rerank_keep_top: entry.rerank_keep_top,
|
||||
chunk_result_cap: entry.chunk_result_cap.unwrap_or(5),
|
||||
chunk_rrf_k: default_chunk_rrf_k(),
|
||||
chunk_rrf_vector_weight: default_chunk_rrf_weight(),
|
||||
chunk_rrf_fts_weight: default_chunk_rrf_weight(),
|
||||
chunk_rrf_use_vector: default_chunk_rrf_use(),
|
||||
chunk_rrf_use_fts: default_chunk_rrf_use(),
|
||||
chunk_vector_take: 0,
|
||||
chunk_fts_take: 0,
|
||||
ingest_chunk_min_tokens: entry.ingest_chunk_min_tokens.unwrap_or(256),
|
||||
@@ -1098,6 +1130,11 @@ mod tests {
|
||||
detailed_report: true,
|
||||
retrieval_strategy: "initial".into(),
|
||||
chunk_result_cap: 5,
|
||||
chunk_rrf_k: 60.0,
|
||||
chunk_rrf_vector_weight: 1.0,
|
||||
chunk_rrf_fts_weight: 1.0,
|
||||
chunk_rrf_use_vector: true,
|
||||
chunk_rrf_use_fts: true,
|
||||
ingest_chunk_min_tokens: 256,
|
||||
ingest_chunk_max_tokens: 512,
|
||||
ingest_chunk_overlap_tokens: 50,
|
||||
|
||||
Reference in New Issue
Block a user