mirror of
https://github.com/perstarkse/minne.git
synced 2026-07-04 03:51:43 +02:00
chore: git-hooks rustfmt and clippy
This commit is contained in:
+3
-4
@@ -2,10 +2,9 @@
|
|||||||
|
|
||||||
## Unreleased
|
## Unreleased
|
||||||
|
|
||||||
- Refactor: deduplicated test database setup across common/src/storage/ types by routing remaining inline SurrealDbClient::memory() calls through shared setup_test_db(), prepare_text_chunk_test_db(), and prepare_knowledge_entity_test_db() helpers; removed redundant apply_migrations() calls after setup_test_db() and collapsed configure_embedding_dimension + redefine_hnsw_index triplication into prepare_*_test_db helpers; extracted generic ensure_fts_index helper for FTS index bootstrap replacing duplicated per-table ensure_*_fts_indexes helpers
|
- Fix: added pre-commit hooks to further maintain code consistency.
|
||||||
- Refactor: split knowledge-graph.js monolith into focused functions (loadGraphData, buildSvg, createSimulation, drawLinks/Nodes/Labels, createHighlighting, createZoom, attachResize); fixed dead duplicate zoom instance
|
- Refactor: deduplicated test database setup across common/src/storage/.
|
||||||
- Refactor: extracted rubberbanding scroll logic in design-polish.js into standalone attachRubberbanding helper; removed dead pullDistance state
|
- Refactor: split knowledge-graph.js monolith into focused functions.
|
||||||
|
|
||||||
- Evaluations: simplified crate layout — linear pipeline, sharded-only converted store, in-memory ingestion, `db/` and `cli/` modules; namespace reuse state in corpus manifest (removed `cache/snapshots/`); no legacy JSON/history compatibility (re-run `--warm` after upgrade)
|
- Evaluations: simplified crate layout — linear pipeline, sharded-only converted store, in-memory ingestion, `db/` and `cli/` modules; namespace reuse state in corpus manifest (removed `cache/snapshots/`); no legacy JSON/history compatibility (re-run `--warm` after upgrade)
|
||||||
- Performance: ingestion skips per-task index rebuild; worker runs scheduled `REBUILD INDEX` (default every 24h via `index_rebuild_interval_secs`, `0` disables)
|
- Performance: ingestion skips per-task index rebuild; worker runs scheduled `REBUILD INDEX` (default every 24h via `index_rebuild_interval_secs`, `0` disables)
|
||||||
- Performance: ingestion persists all artifacts in a single SurrealDB transaction per task (atomic replace by task id)
|
- Performance: ingestion persists all artifacts in a single SurrealDB transaction per task (atomic replace by task id)
|
||||||
|
|||||||
+14
-14
@@ -3,10 +3,10 @@
|
|||||||
"devenv": {
|
"devenv": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"dir": "src/modules",
|
"dir": "src/modules",
|
||||||
"lastModified": 1771066302,
|
"lastModified": 1781800860,
|
||||||
"owner": "cachix",
|
"owner": "cachix",
|
||||||
"repo": "devenv",
|
"repo": "devenv",
|
||||||
"rev": "1b355dec9bddbaddbe4966d6fc30d7aa3af8575b",
|
"rev": "d59d872d80876d9eeb3e214d3b088bc4a14a9c4f",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -22,10 +22,10 @@
|
|||||||
"rust-analyzer-src": "rust-analyzer-src"
|
"rust-analyzer-src": "rust-analyzer-src"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1771052630,
|
"lastModified": 1781779700,
|
||||||
"owner": "nix-community",
|
"owner": "nix-community",
|
||||||
"repo": "fenix",
|
"repo": "fenix",
|
||||||
"rev": "d0555da98576b8611c25df0c208e51e9a182d95f",
|
"rev": "ad30e585c7a2917325943c2b19511f5a249eff53",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -58,10 +58,10 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1770726378,
|
"lastModified": 1781733627,
|
||||||
"owner": "cachix",
|
"owner": "cachix",
|
||||||
"repo": "git-hooks.nix",
|
"repo": "git-hooks.nix",
|
||||||
"rev": "5eaaedde414f6eb1aea8b8525c466dc37bba95ae",
|
"rev": "3bbec39bc90eadfa031e6f3b77272f3f60803e39",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -92,10 +92,10 @@
|
|||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1771008912,
|
"lastModified": 1781577229,
|
||||||
"owner": "nixos",
|
"owner": "nixos",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "a82ccc39b39b621151d6732718e3e250109076fa",
|
"rev": "567a49d1913ce81ac6e9582e3553dd90a955875f",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -107,10 +107,10 @@
|
|||||||
},
|
},
|
||||||
"nixpkgs_2": {
|
"nixpkgs_2": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1770843696,
|
"lastModified": 1781607440,
|
||||||
"owner": "nixos",
|
"owner": "nixos",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "2343bbb58f99267223bc2aac4fc9ea301a155a16",
|
"rev": "3e41b24abd260e8f71dbe2f5737d24122f972158",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -135,10 +135,10 @@
|
|||||||
"rust-analyzer-src": {
|
"rust-analyzer-src": {
|
||||||
"flake": false,
|
"flake": false,
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1771007332,
|
"lastModified": 1781714865,
|
||||||
"owner": "rust-lang",
|
"owner": "rust-lang",
|
||||||
"repo": "rust-analyzer",
|
"repo": "rust-analyzer",
|
||||||
"rev": "bbc84d335fbbd9b3099d3e40c7469ee57dbd1873",
|
"rev": "abb1301c3c14a40645bb2588b1cc858fe374b527",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -155,10 +155,10 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1771038269,
|
"lastModified": 1781850613,
|
||||||
"owner": "oxalica",
|
"owner": "oxalica",
|
||||||
"repo": "rust-overlay",
|
"repo": "rust-overlay",
|
||||||
"rev": "d7a86c8a4df49002446737603a3e0d7ef91a9637",
|
"rev": "4baecb43a008cd004e5220a777e1724bd8d43e43",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|||||||
+17
-4
@@ -4,19 +4,32 @@
|
|||||||
config,
|
config,
|
||||||
inputs,
|
inputs,
|
||||||
...
|
...
|
||||||
}:
|
}: let
|
||||||
let
|
|
||||||
ortVersion = lib.removeSuffix "\n" (builtins.readFile "${toString ./.}/ort-version");
|
ortVersion = lib.removeSuffix "\n" (builtins.readFile "${toString ./.}/ort-version");
|
||||||
_ortVersionCheck =
|
_ortVersionCheck =
|
||||||
if pkgs.onnxruntime.version == ortVersion
|
if pkgs.onnxruntime.version == ortVersion
|
||||||
then null
|
then null
|
||||||
else
|
else throw "pkgs.onnxruntime.version (${pkgs.onnxruntime.version}) must match ort-version (${ortVersion})";
|
||||||
throw "pkgs.onnxruntime.version (${pkgs.onnxruntime.version}) must match ort-version (${ortVersion})";
|
|
||||||
in {
|
in {
|
||||||
devenv.warnOnNewVersion = false;
|
devenv.warnOnNewVersion = false;
|
||||||
|
|
||||||
cachix.enable = false;
|
cachix.enable = false;
|
||||||
|
|
||||||
|
git-hooks.install.enable = true;
|
||||||
|
git-hooks.hooks = {
|
||||||
|
rustfmt.enable = true;
|
||||||
|
clippy = {
|
||||||
|
enable = true;
|
||||||
|
settings.allFeatures = true;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# Use pinned Rust toolchain from languages.rust for git-hooks wrappers
|
||||||
|
# (git-hooks.nix defaults to nixpkgs's cargo/clippy/rustfmt, ignoring the pin)
|
||||||
|
git-hooks.tools.cargo = lib.mkDefault config.languages.rust.toolchain.cargo;
|
||||||
|
git-hooks.tools.clippy = lib.mkDefault config.languages.rust.toolchain.clippy;
|
||||||
|
git-hooks.tools.rustfmt = lib.mkDefault config.languages.rust.toolchain.rustfmt;
|
||||||
|
|
||||||
packages = [
|
packages = [
|
||||||
pkgs.openssl
|
pkgs.openssl
|
||||||
pkgs.nodejs
|
pkgs.nodejs
|
||||||
|
|||||||
@@ -9,3 +9,6 @@ inputs:
|
|||||||
nixpkgs:
|
nixpkgs:
|
||||||
follows: nixpkgs
|
follows: nixpkgs
|
||||||
allowUnfree: true
|
allowUnfree: true
|
||||||
|
nixpkgs:
|
||||||
|
permittedInsecurePackages:
|
||||||
|
- "minio-2025-10-15T17-29-55Z"
|
||||||
|
|||||||
@@ -103,10 +103,16 @@ pub async fn collect_status(config: &Config) -> Result<EvalStatus> {
|
|||||||
ready: slice_manifest
|
ready: slice_manifest
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.is_some_and(|manifest| slice::manifest_is_complete(manifest, &slice_config)),
|
.is_some_and(|manifest| slice::manifest_is_complete(manifest, &slice_config)),
|
||||||
path: manifest_path.as_ref().map(|path| path.display().to_string()),
|
path: manifest_path
|
||||||
|
.as_ref()
|
||||||
|
.map(|path| path.display().to_string()),
|
||||||
cases: slice_manifest.as_ref().map(|manifest| manifest.case_count),
|
cases: slice_manifest.as_ref().map(|manifest| manifest.case_count),
|
||||||
positives: slice_manifest.as_ref().map(|manifest| manifest.positive_paragraphs),
|
positives: slice_manifest
|
||||||
negatives: slice_manifest.as_ref().map(|manifest| manifest.negative_paragraphs),
|
.as_ref()
|
||||||
|
.map(|manifest| manifest.positive_paragraphs),
|
||||||
|
negatives: slice_manifest
|
||||||
|
.as_ref()
|
||||||
|
.map(|manifest| manifest.negative_paragraphs),
|
||||||
};
|
};
|
||||||
|
|
||||||
let beir_paragraph_ids = slice_manifest.as_ref().map(|manifest| {
|
let beir_paragraph_ids = slice_manifest.as_ref().map(|manifest| {
|
||||||
@@ -159,17 +165,9 @@ pub async fn collect_status(config: &Config) -> Result<EvalStatus> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let namespace = config
|
let namespace = config.database.db_namespace.clone().unwrap_or_else(|| {
|
||||||
.database
|
default_namespace(config.dataset.id(), config.limit, config.slice.as_deref())
|
||||||
.db_namespace
|
});
|
||||||
.clone()
|
|
||||||
.unwrap_or_else(|| {
|
|
||||||
default_namespace(
|
|
||||||
config.dataset.id(),
|
|
||||||
config.limit,
|
|
||||||
config.slice.as_deref(),
|
|
||||||
)
|
|
||||||
});
|
|
||||||
let database = config
|
let database = config
|
||||||
.database
|
.database
|
||||||
.db_database
|
.db_database
|
||||||
@@ -183,16 +181,17 @@ pub async fn collect_status(config: &Config) -> Result<EvalStatus> {
|
|||||||
.and_then(|manifest| manifest.metadata.namespace_seed)
|
.and_then(|manifest| manifest.metadata.namespace_seed)
|
||||||
});
|
});
|
||||||
|
|
||||||
let (seeded, namespace_seed_recorded) = match connect_eval_db(config, &namespace, &database).await {
|
let (seeded, namespace_seed_recorded) =
|
||||||
Ok(db) => {
|
match connect_eval_db(config, &namespace, &database).await {
|
||||||
let has_corpus = namespace_has_corpus(&db).await.unwrap_or(false);
|
Ok(db) => {
|
||||||
(has_corpus, namespace_seed.is_some())
|
let has_corpus = namespace_has_corpus(&db).await.unwrap_or(false);
|
||||||
}
|
(has_corpus, namespace_seed.is_some())
|
||||||
Err(err) => {
|
}
|
||||||
notes.push(format!("SurrealDB unavailable: {err}"));
|
Err(err) => {
|
||||||
(false, false)
|
notes.push(format!("SurrealDB unavailable: {err}"));
|
||||||
}
|
(false, false)
|
||||||
};
|
}
|
||||||
|
};
|
||||||
|
|
||||||
let query_ready = converted_ready
|
let query_ready = converted_ready
|
||||||
&& slice_ledger.ready
|
&& slice_ledger.ready
|
||||||
@@ -281,11 +280,7 @@ pub fn print_status(status: &EvalStatus) {
|
|||||||
);
|
);
|
||||||
println!(
|
println!(
|
||||||
"Query-ready: {}",
|
"Query-ready: {}",
|
||||||
if status.query_ready {
|
if status.query_ready { "yes" } else { "no" }
|
||||||
"yes"
|
|
||||||
} else {
|
|
||||||
"no"
|
|
||||||
}
|
|
||||||
);
|
);
|
||||||
for note in &status.notes {
|
for note in &status.notes {
|
||||||
println!("Note: {note}");
|
println!("Note: {note}");
|
||||||
|
|||||||
@@ -701,10 +701,7 @@ mod tests {
|
|||||||
|
|
||||||
ConvertedDataset {
|
ConvertedDataset {
|
||||||
generated_at: Utc::now(),
|
generated_at: Utc::now(),
|
||||||
metadata: crate::datasets::DatasetMetadata::for_kind(
|
metadata: crate::datasets::DatasetMetadata::for_kind(DatasetKind::default(), false),
|
||||||
DatasetKind::default(),
|
|
||||||
false,
|
|
||||||
),
|
|
||||||
source: "src".to_string(),
|
source: "src".to_string(),
|
||||||
paragraphs: vec![paragraph],
|
paragraphs: vec![paragraph],
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,11 +10,8 @@ use chrono::{DateTime, Utc};
|
|||||||
use common::storage::{
|
use common::storage::{
|
||||||
db::SurrealDbClient,
|
db::SurrealDbClient,
|
||||||
types::{
|
types::{
|
||||||
knowledge_entity::KnowledgeEntity,
|
knowledge_entity::KnowledgeEntity, knowledge_relationship::KnowledgeRelationship,
|
||||||
knowledge_relationship::KnowledgeRelationship,
|
text_chunk::TextChunk, text_content::TextContent, StoredObject,
|
||||||
text_chunk::TextChunk,
|
|
||||||
text_content::TextContent,
|
|
||||||
StoredObject,
|
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
use ingestion_pipeline::{persist_artifacts, IngestionTuning, PipelineArtifacts};
|
use ingestion_pipeline::{persist_artifacts, IngestionTuning, PipelineArtifacts};
|
||||||
|
|||||||
@@ -190,9 +190,7 @@ pub fn convert_beir_documents(
|
|||||||
|
|
||||||
pub fn corpus_doc_id(paragraph_id: &str, dataset: DatasetKind) -> Option<String> {
|
pub fn corpus_doc_id(paragraph_id: &str, dataset: DatasetKind) -> Option<String> {
|
||||||
let prefix = format!("{}-", dataset.source_prefix());
|
let prefix = format!("{}-", dataset.source_prefix());
|
||||||
paragraph_id
|
paragraph_id.strip_prefix(&prefix).map(str::to_string)
|
||||||
.strip_prefix(&prefix)
|
|
||||||
.map(str::to_string)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn resolve_qrels_path(raw_dir: &Path) -> Result<PathBuf> {
|
fn resolve_qrels_path(raw_dir: &Path) -> Result<PathBuf> {
|
||||||
|
|||||||
@@ -11,12 +11,9 @@ use super::{
|
|||||||
self, build_dataset_from_catalog, paragraph_path, read_meta, store_dir_for,
|
self, build_dataset_from_catalog, paragraph_path, read_meta, store_dir_for,
|
||||||
upsert_sharded_paragraphs, write_sharded,
|
upsert_sharded_paragraphs, write_sharded,
|
||||||
},
|
},
|
||||||
BEIR_DATASETS, ConvertedDataset, DatasetKind, DatasetMetadata,
|
ConvertedDataset, DatasetKind, DatasetMetadata, BEIR_DATASETS,
|
||||||
};
|
|
||||||
use crate::{
|
|
||||||
args::Config,
|
|
||||||
slice,
|
|
||||||
};
|
};
|
||||||
|
use crate::{args::Config, slice};
|
||||||
|
|
||||||
pub fn subset_for_paragraph_id(paragraph_id: &str) -> Option<DatasetKind> {
|
pub fn subset_for_paragraph_id(paragraph_id: &str) -> Option<DatasetKind> {
|
||||||
let mut kinds: Vec<DatasetKind> = BEIR_DATASETS.to_vec();
|
let mut kinds: Vec<DatasetKind> = BEIR_DATASETS.to_vec();
|
||||||
@@ -53,9 +50,8 @@ pub fn build_beir_mix_qrels_dataset(include_unanswerable: bool) -> Result<Conver
|
|||||||
pub fn prepare_beir_mix(config: &Config) -> Result<super::loader::LoadedDataset> {
|
pub fn prepare_beir_mix(config: &Config) -> Result<super::loader::LoadedDataset> {
|
||||||
let virtual_ds = build_beir_mix_qrels_dataset(config.llm_mode)?;
|
let virtual_ds = build_beir_mix_qrels_dataset(config.llm_mode)?;
|
||||||
let slice_config = slice::slice_config_with_limit(config, slice::ledger_target(config));
|
let slice_config = slice::slice_config_with_limit(config, slice::ledger_target(config));
|
||||||
let resolved = slice::resolve_slice(&virtual_ds, &slice_config).context(
|
let resolved = slice::resolve_slice(&virtual_ds, &slice_config)
|
||||||
"resolving BEIR mix slice ledger (check --slice and --limit match your intent)",
|
.context("resolving BEIR mix slice ledger (check --slice and --limit match your intent)")?;
|
||||||
)?;
|
|
||||||
|
|
||||||
let unique: HashSet<String> = resolved
|
let unique: HashSet<String> = resolved
|
||||||
.manifest
|
.manifest
|
||||||
@@ -83,16 +79,16 @@ pub fn prepare_beir_mix(config: &Config) -> Result<super::loader::LoadedDataset>
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn materialize_subset_stores(
|
pub fn materialize_subset_stores(paragraph_ids: &HashSet<String>, force: bool) -> Result<()> {
|
||||||
paragraph_ids: &HashSet<String>,
|
|
||||||
force: bool,
|
|
||||||
) -> Result<()> {
|
|
||||||
let mut by_subset: HashMap<DatasetKind, Vec<String>> = HashMap::new();
|
let mut by_subset: HashMap<DatasetKind, Vec<String>> = HashMap::new();
|
||||||
for paragraph_id in paragraph_ids {
|
for paragraph_id in paragraph_ids {
|
||||||
let kind = subset_for_paragraph_id(paragraph_id).with_context(|| {
|
let kind = subset_for_paragraph_id(paragraph_id).with_context(|| {
|
||||||
format!("routing BEIR mix paragraph id '{paragraph_id}' to subset store")
|
format!("routing BEIR mix paragraph id '{paragraph_id}' to subset store")
|
||||||
})?;
|
})?;
|
||||||
by_subset.entry(kind).or_default().push(paragraph_id.clone());
|
by_subset
|
||||||
|
.entry(kind)
|
||||||
|
.or_default()
|
||||||
|
.push(paragraph_id.clone());
|
||||||
}
|
}
|
||||||
|
|
||||||
for (kind, ids) in by_subset {
|
for (kind, ids) in by_subset {
|
||||||
@@ -120,11 +116,7 @@ pub fn materialize_subset_stores(
|
|||||||
.iter()
|
.iter()
|
||||||
.filter_map(|paragraph_id| beir::corpus_doc_id(paragraph_id, kind))
|
.filter_map(|paragraph_id| beir::corpus_doc_id(paragraph_id, kind))
|
||||||
.collect();
|
.collect();
|
||||||
let paragraphs = beir::convert_beir_documents(
|
let paragraphs = beir::convert_beir_documents(&entry.raw_path, kind, Some(&corpus_ids))?;
|
||||||
&entry.raw_path,
|
|
||||||
kind,
|
|
||||||
Some(&corpus_ids),
|
|
||||||
)?;
|
|
||||||
|
|
||||||
if store_dir.join("meta.json").is_file() {
|
if store_dir.join("meta.json").is_file() {
|
||||||
upsert_sharded_paragraphs(&store_dir, &paragraphs)?;
|
upsert_sharded_paragraphs(&store_dir, &paragraphs)?;
|
||||||
@@ -233,7 +225,11 @@ pub fn beir_subset_store_summary() -> Result<Vec<(String, usize, usize)>> {
|
|||||||
let store_dir = store_dir_for(&entry.converted_path);
|
let store_dir = store_dir_for(&entry.converted_path);
|
||||||
if store_dir.join("meta.json").is_file() {
|
if store_dir.join("meta.json").is_file() {
|
||||||
let meta = read_meta(&store_dir)?;
|
let meta = read_meta(&store_dir)?;
|
||||||
summary.push((kind.id().to_string(), meta.paragraph_count, meta.question_count));
|
summary.push((
|
||||||
|
kind.id().to_string(),
|
||||||
|
meta.paragraph_count,
|
||||||
|
meta.question_count,
|
||||||
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(summary)
|
Ok(summary)
|
||||||
|
|||||||
@@ -56,8 +56,8 @@ impl ChecksumSidecar {
|
|||||||
|
|
||||||
#[allow(clippy::indexing_slicing)]
|
#[allow(clippy::indexing_slicing)]
|
||||||
pub fn hash_file(path: &Path) -> Result<String> {
|
pub fn hash_file(path: &Path) -> Result<String> {
|
||||||
let mut file =
|
let mut file = File::open(path)
|
||||||
File::open(path).with_context(|| format!("opening file {} for checksum", path.display()))?;
|
.with_context(|| format!("opening file {} for checksum", path.display()))?;
|
||||||
let mut hasher = Sha256::new();
|
let mut hasher = Sha256::new();
|
||||||
let mut buffer = vec![0u8; 65_536];
|
let mut buffer = vec![0u8; 65_536];
|
||||||
loop {
|
loop {
|
||||||
@@ -176,7 +176,10 @@ fn collect_store_files(base: &Path, current: &Path, entries: &mut Vec<String>) -
|
|||||||
for entry in fs::read_dir(current)? {
|
for entry in fs::read_dir(current)? {
|
||||||
let entry = entry?;
|
let entry = entry?;
|
||||||
let path = entry.path();
|
let path = entry.path();
|
||||||
if path.file_name().is_some_and(|name| name == "checksum.sha256") {
|
if path
|
||||||
|
.file_name()
|
||||||
|
.is_some_and(|name| name == "checksum.sha256")
|
||||||
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if path.is_dir() {
|
if path.is_dir() {
|
||||||
|
|||||||
@@ -186,9 +186,7 @@ fn slice_config_for_catalog_entry<'a>(
|
|||||||
limit: slice_entry.limit,
|
limit: slice_entry.limit,
|
||||||
corpus_limit: slice_entry.corpus_limit,
|
corpus_limit: slice_entry.corpus_limit,
|
||||||
slice_seed: slice_entry.seed.unwrap_or(config.slice_seed),
|
slice_seed: slice_entry.seed.unwrap_or(config.slice_seed),
|
||||||
llm_mode: slice_entry
|
llm_mode: slice_entry.include_unanswerable.unwrap_or(config.llm_mode),
|
||||||
.include_unanswerable
|
|
||||||
.unwrap_or(config.llm_mode),
|
|
||||||
negative_multiplier: slice_entry
|
negative_multiplier: slice_entry
|
||||||
.negative_multiplier
|
.negative_multiplier
|
||||||
.unwrap_or(config.negative_multiplier),
|
.unwrap_or(config.negative_multiplier),
|
||||||
|
|||||||
@@ -222,10 +222,8 @@ fn resolve_path(root: &Path, value: &str) -> PathBuf {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub use beir_mix::{beir_subset_store_summary, beir_subset_stores_ready, mix_content_checksum};
|
||||||
pub use checksum::store_aggregate_checksum;
|
pub use checksum::store_aggregate_checksum;
|
||||||
pub use beir_mix::{
|
|
||||||
beir_subset_store_summary, beir_subset_stores_ready, mix_content_checksum,
|
|
||||||
};
|
|
||||||
pub use loader::{prebuild_catalog_slices, prepare_dataset};
|
pub use loader::{prebuild_catalog_slices, prepare_dataset};
|
||||||
pub use store::{
|
pub use store::{
|
||||||
content_checksum_for_layout, detect_layout, store_dir_for, write_sharded, ConvertedLayout,
|
content_checksum_for_layout, detect_layout, store_dir_for, write_sharded, ConvertedLayout,
|
||||||
|
|||||||
@@ -11,8 +11,8 @@ use serde::{Deserialize, Serialize};
|
|||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
use super::{
|
use super::{
|
||||||
checksum::store_aggregate_checksum,
|
checksum::store_aggregate_checksum, ConvertedDataset, ConvertedParagraph, ConvertedQuestion,
|
||||||
ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetMetadata,
|
DatasetMetadata,
|
||||||
};
|
};
|
||||||
use crate::slice;
|
use crate::slice;
|
||||||
|
|
||||||
@@ -50,11 +50,10 @@ pub fn store_dir_for(converted_path: &Path) -> PathBuf {
|
|||||||
converted_path
|
converted_path
|
||||||
.parent()
|
.parent()
|
||||||
.unwrap_or_else(|| Path::new("."))
|
.unwrap_or_else(|| Path::new("."))
|
||||||
.join(
|
.join(converted_path.file_stem().map_or_else(
|
||||||
converted_path
|
|| "dataset".to_string(),
|
||||||
.file_stem()
|
|stem| stem.to_string_lossy().into(),
|
||||||
.map_or_else(|| "dataset".to_string(), |stem| stem.to_string_lossy().into()),
|
))
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn detect_layout(converted_path: &Path) -> ConvertedLayout {
|
pub fn detect_layout(converted_path: &Path) -> ConvertedLayout {
|
||||||
@@ -167,8 +166,8 @@ pub fn content_checksum_for_layout(converted_path: &Path) -> Result<String> {
|
|||||||
|
|
||||||
fn load_paragraph(store_dir: &Path, paragraph_id: &str) -> Result<ConvertedParagraph> {
|
fn load_paragraph(store_dir: &Path, paragraph_id: &str) -> Result<ConvertedParagraph> {
|
||||||
let path = paragraph_path(store_dir, paragraph_id);
|
let path = paragraph_path(store_dir, paragraph_id);
|
||||||
let raw = fs::read(&path)
|
let raw =
|
||||||
.with_context(|| format!("reading sharded paragraph {}", path.display()))?;
|
fs::read(&path).with_context(|| format!("reading sharded paragraph {}", path.display()))?;
|
||||||
serde_json::from_slice(&raw)
|
serde_json::from_slice(&raw)
|
||||||
.with_context(|| format!("parsing sharded paragraph {}", path.display()))
|
.with_context(|| format!("parsing sharded paragraph {}", path.display()))
|
||||||
}
|
}
|
||||||
@@ -180,7 +179,10 @@ fn load_paragraphs(store_dir: &Path, paragraph_ids: &[String]) -> Result<Vec<Con
|
|||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn load_sharded_partial(store_dir: &Path, paragraph_ids: &[String]) -> Result<ConvertedDataset> {
|
pub fn load_sharded_partial(
|
||||||
|
store_dir: &Path,
|
||||||
|
paragraph_ids: &[String],
|
||||||
|
) -> Result<ConvertedDataset> {
|
||||||
let meta = read_meta(store_dir)?;
|
let meta = read_meta(store_dir)?;
|
||||||
let mut paragraphs = load_paragraphs(store_dir, paragraph_ids)?;
|
let mut paragraphs = load_paragraphs(store_dir, paragraph_ids)?;
|
||||||
paragraphs.sort_by(|left, right| left.id.cmp(&right.id));
|
paragraphs.sort_by(|left, right| left.id.cmp(&right.id));
|
||||||
@@ -333,8 +335,8 @@ pub fn load_question_catalog(store_dir: &Path) -> Result<QuestionCatalog> {
|
|||||||
if line.trim().is_empty() {
|
if line.trim().is_empty() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
let record: QuestionRecord = serde_json::from_str(&line)
|
let record: QuestionRecord =
|
||||||
.context("parsing question catalog record")?;
|
serde_json::from_str(&line).context("parsing question catalog record")?;
|
||||||
entries.push(record);
|
entries.push(record);
|
||||||
}
|
}
|
||||||
Ok(QuestionCatalog { entries })
|
Ok(QuestionCatalog { entries })
|
||||||
|
|||||||
@@ -132,8 +132,7 @@ pub(crate) async fn can_reuse_namespace(
|
|||||||
if seed.namespace != namespace || seed.database != database {
|
if seed.namespace != namespace || seed.database != database {
|
||||||
info!(
|
info!(
|
||||||
namespace,
|
namespace,
|
||||||
database,
|
database, "Corpus manifest namespace metadata mismatch; reseeding"
|
||||||
"Corpus manifest namespace metadata mismatch; reseeding"
|
|
||||||
);
|
);
|
||||||
return Ok(false);
|
return Ok(false);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,5 +5,5 @@ pub(crate) use connect::{
|
|||||||
can_reuse_namespace, connect_eval_db, default_database, default_namespace, ensure_eval_user,
|
can_reuse_namespace, connect_eval_db, default_database, default_namespace, ensure_eval_user,
|
||||||
namespace_has_corpus, record_namespace_seed, sanitize_model_code,
|
namespace_has_corpus, record_namespace_seed, sanitize_model_code,
|
||||||
};
|
};
|
||||||
pub use lifecycle::{recreate_indexes, reset_namespace};
|
|
||||||
pub(crate) use lifecycle::warm_hnsw_cache;
|
pub(crate) use lifecycle::warm_hnsw_cache;
|
||||||
|
pub use lifecycle::{recreate_indexes, reset_namespace};
|
||||||
|
|||||||
@@ -1,8 +1,4 @@
|
|||||||
use std::{
|
use std::{collections::HashMap, fs, path::Path};
|
||||||
collections::HashMap,
|
|
||||||
fs,
|
|
||||||
path::Path,
|
|
||||||
};
|
|
||||||
|
|
||||||
use anyhow::{anyhow, Context, Result};
|
use anyhow::{anyhow, Context, Result};
|
||||||
use common::storage::{db::SurrealDbClient, types::text_chunk::TextChunk};
|
use common::storage::{db::SurrealDbClient, types::text_chunk::TextChunk};
|
||||||
@@ -72,9 +68,9 @@ pub async fn inspect_question(config: &Config) -> Result<()> {
|
|||||||
MissingChunks::None => println!(
|
MissingChunks::None => println!(
|
||||||
"All matching_chunk_ids exist in namespace '{ns}', database '{db_name}'"
|
"All matching_chunk_ids exist in namespace '{ns}', database '{db_name}'"
|
||||||
),
|
),
|
||||||
MissingChunks::Missing(list) => println!(
|
MissingChunks::Missing(list) => {
|
||||||
"Missing chunks in namespace '{ns}', database '{db_name}': {list:?}"
|
println!("Missing chunks in namespace '{ns}', database '{db_name}': {list:?}");
|
||||||
),
|
}
|
||||||
},
|
},
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
println!(
|
println!(
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
mod args;
|
mod args;
|
||||||
mod context_stats;
|
|
||||||
mod cases;
|
mod cases;
|
||||||
mod cli;
|
mod cli;
|
||||||
|
mod context_stats;
|
||||||
mod corpus;
|
mod corpus;
|
||||||
mod datasets;
|
mod datasets;
|
||||||
mod db;
|
mod db;
|
||||||
@@ -129,10 +129,7 @@ async fn async_main() -> anyhow::Result<()> {
|
|||||||
let store_dir = datasets::store_dir_for(&parsed.config.converted_dataset_path);
|
let store_dir = datasets::store_dir_for(&parsed.config.converted_dataset_path);
|
||||||
datasets::write_sharded(&dataset, &store_dir)?;
|
datasets::write_sharded(&dataset, &store_dir)?;
|
||||||
datasets::prebuild_catalog_slices(&dataset, &parsed.config)?;
|
datasets::prebuild_catalog_slices(&dataset, &parsed.config)?;
|
||||||
println!(
|
println!("Converted dataset written under {}", store_dir.display());
|
||||||
"Converted dataset written under {}",
|
|
||||||
store_dir.display()
|
|
||||||
);
|
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -141,14 +138,13 @@ async fn async_main() -> anyhow::Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
info!(dataset = dataset_kind.id(), "Preparing converted dataset");
|
info!(dataset = dataset_kind.id(), "Preparing converted dataset");
|
||||||
let loaded = crate::datasets::prepare_dataset(dataset_kind, &parsed.config).with_context(
|
let loaded =
|
||||||
|| {
|
crate::datasets::prepare_dataset(dataset_kind, &parsed.config).with_context(|| {
|
||||||
format!(
|
format!(
|
||||||
"preparing converted dataset at {}",
|
"preparing converted dataset at {}",
|
||||||
parsed.config.converted_dataset_path.display()
|
parsed.config.converted_dataset_path.display()
|
||||||
)
|
)
|
||||||
},
|
})?;
|
||||||
)?;
|
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
questions = loaded
|
questions = loaded
|
||||||
|
|||||||
@@ -14,10 +14,7 @@ pub fn ingestion_openai_client(
|
|||||||
)?;
|
)?;
|
||||||
Ok((Arc::new(client), Some(base_url)))
|
Ok((Arc::new(client), Some(base_url)))
|
||||||
} else {
|
} else {
|
||||||
Ok((
|
Ok((Arc::new(Client::with_config(OpenAIConfig::default())), None))
|
||||||
Arc::new(Client::with_config(OpenAIConfig::default())),
|
|
||||||
None,
|
|
||||||
))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -91,7 +91,10 @@ fn format_duration(value: Option<u128>) -> String {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::types::{EvaluationStageTimings, PerformanceTimings, LatencyStats, StageLatency, StageLatencyBreakdown};
|
use crate::types::{
|
||||||
|
EvaluationStageTimings, LatencyStats, PerformanceTimings, StageLatency,
|
||||||
|
StageLatencyBreakdown,
|
||||||
|
};
|
||||||
use chrono::Utc;
|
use chrono::Utc;
|
||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
|
|
||||||
|
|||||||
@@ -26,12 +26,7 @@ pub async fn warm_evaluation(
|
|||||||
config: &Config,
|
config: &Config,
|
||||||
content_checksum: &str,
|
content_checksum: &str,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let _ctx = run_through_namespace(
|
let _ctx = run_through_namespace(dataset, config, Some(content_checksum.to_string())).await?;
|
||||||
dataset,
|
|
||||||
config,
|
|
||||||
Some(content_checksum.to_string()),
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -40,11 +35,7 @@ pub async fn run_evaluation(
|
|||||||
config: &Config,
|
config: &Config,
|
||||||
content_checksum: Option<&str>,
|
content_checksum: Option<&str>,
|
||||||
) -> Result<EvaluationSummary> {
|
) -> Result<EvaluationSummary> {
|
||||||
let mut ctx = EvaluationContext::new(
|
let mut ctx = EvaluationContext::new(dataset, config, content_checksum.map(str::to_string));
|
||||||
dataset,
|
|
||||||
config,
|
|
||||||
content_checksum.map(str::to_string),
|
|
||||||
);
|
|
||||||
stages::prepare_slice(&mut ctx).await?;
|
stages::prepare_slice(&mut ctx).await?;
|
||||||
stages::prepare_db(&mut ctx).await?;
|
stages::prepare_db(&mut ctx).await?;
|
||||||
stages::prepare_corpus(&mut ctx).await?;
|
stages::prepare_corpus(&mut ctx).await?;
|
||||||
|
|||||||
@@ -3,7 +3,10 @@ use std::time::Instant;
|
|||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
use crate::{db::{default_database, default_namespace}, slice};
|
use crate::{
|
||||||
|
db::{default_database, default_namespace},
|
||||||
|
slice,
|
||||||
|
};
|
||||||
|
|
||||||
use super::super::context::{EvalStage, EvaluationContext};
|
use super::super::context::{EvalStage, EvaluationContext};
|
||||||
|
|
||||||
|
|||||||
@@ -14,9 +14,7 @@ use tracing::{info, warn};
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
args::Config,
|
args::Config,
|
||||||
datasets::{
|
datasets::{ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind},
|
||||||
ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind,
|
|
||||||
},
|
|
||||||
};
|
};
|
||||||
|
|
||||||
mod beir;
|
mod beir;
|
||||||
@@ -244,8 +242,7 @@ pub fn resolve_slice<'a>(
|
|||||||
);
|
);
|
||||||
return Ok(resolved);
|
return Ok(resolved);
|
||||||
}
|
}
|
||||||
let resolved =
|
let resolved = materialize_slice_ledger(dataset, config, &index, slice_arg, path)?;
|
||||||
materialize_slice_ledger(dataset, config, &index, slice_arg, path)?;
|
|
||||||
info!(
|
info!(
|
||||||
slice = %resolved.manifest.slice_id,
|
slice = %resolved.manifest.slice_id,
|
||||||
path = %resolved.path.display(),
|
path = %resolved.path.display(),
|
||||||
@@ -927,10 +924,7 @@ pub fn cached_manifest_path(config: &crate::args::Config) -> Option<PathBuf> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn manifest_is_complete(manifest: &SliceManifest, config: &SliceConfig<'_>) -> bool {
|
pub fn manifest_is_complete(manifest: &SliceManifest, config: &SliceConfig<'_>) -> bool {
|
||||||
let requested_limit = config
|
let requested_limit = config.limit.unwrap_or(manifest.case_count.max(1)).max(1);
|
||||||
.limit
|
|
||||||
.unwrap_or(manifest.case_count.max(1))
|
|
||||||
.max(1);
|
|
||||||
if manifest.case_count < requested_limit {
|
if manifest.case_count < requested_limit {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -942,7 +936,9 @@ pub fn manifest_is_complete(manifest: &SliceManifest, config: &SliceConfig<'_>)
|
|||||||
let desired_negatives = desired_negative_target(
|
let desired_negatives = desired_negative_target(
|
||||||
manifest.positive_paragraphs,
|
manifest.positive_paragraphs,
|
||||||
requested_corpus,
|
requested_corpus,
|
||||||
manifest.total_paragraphs.max(manifest.positive_paragraphs.max(1)),
|
manifest
|
||||||
|
.total_paragraphs
|
||||||
|
.max(manifest.positive_paragraphs.max(1)),
|
||||||
config.negative_multiplier,
|
config.negative_multiplier,
|
||||||
);
|
);
|
||||||
manifest.negative_paragraphs >= desired_negatives
|
manifest.negative_paragraphs >= desired_negatives
|
||||||
@@ -978,8 +974,7 @@ pub fn ledger_target(config: &Config) -> Option<usize> {
|
|||||||
pub fn grow_slice(dataset: &ConvertedDataset, config: &Config) -> Result<()> {
|
pub fn grow_slice(dataset: &ConvertedDataset, config: &Config) -> Result<()> {
|
||||||
let ledger_limit = ledger_target(config);
|
let ledger_limit = ledger_target(config);
|
||||||
let slice_settings = slice_config_with_limit(config, ledger_limit);
|
let slice_settings = slice_config_with_limit(config, ledger_limit);
|
||||||
let slice =
|
let slice = resolve_slice(dataset, &slice_settings).context("resolving dataset slice")?;
|
||||||
resolve_slice(dataset, &slice_settings).context("resolving dataset slice")?;
|
|
||||||
info!(
|
info!(
|
||||||
slice = slice.manifest.slice_id.as_str(),
|
slice = slice.manifest.slice_id.as_str(),
|
||||||
cases = slice.manifest.case_count,
|
cases = slice.manifest.case_count,
|
||||||
|
|||||||
@@ -109,10 +109,7 @@ impl<'a> PipelineContext<'a> {
|
|||||||
let content = self.take_text_content()?;
|
let content = self.take_text_content()?;
|
||||||
let analysis = self.take_analysis()?;
|
let analysis = self.take_analysis()?;
|
||||||
|
|
||||||
let (entities, relationships) = self
|
let (entities, relationships) = self.services.convert_analysis(&content, &analysis).await?;
|
||||||
.services
|
|
||||||
.convert_analysis(&content, &analysis)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
let chunk_range = self.chunk_token_range();
|
let chunk_range = self.chunk_token_range();
|
||||||
let chunk_overlap = self.chunk_overlap_tokens();
|
let chunk_overlap = self.chunk_overlap_tokens();
|
||||||
|
|||||||
@@ -186,11 +186,7 @@ impl IngestionPipeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn artifacts_persisted(&self, task_id: &str) -> Result<bool, AppError> {
|
async fn artifacts_persisted(&self, task_id: &str) -> Result<bool, AppError> {
|
||||||
Ok(self
|
Ok(self.db.get_item::<TextContent>(task_id).await?.is_some())
|
||||||
.db
|
|
||||||
.get_item::<TextContent>(task_id)
|
|
||||||
.await?
|
|
||||||
.is_some())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn finalize_succeeded(&self, task: &IngestionTask) -> Result<(), AppError> {
|
async fn finalize_succeeded(&self, task: &IngestionTask) -> Result<(), AppError> {
|
||||||
@@ -379,8 +375,7 @@ mod finalize_tests {
|
|||||||
persist_max_backoff_ms: 10,
|
persist_max_backoff_ms: 10,
|
||||||
..IngestionTuning::default()
|
..IngestionTuning::default()
|
||||||
};
|
};
|
||||||
let pipeline =
|
let pipeline = IngestionPipeline::with_services(Arc::new(db.clone()), config, services)?;
|
||||||
IngestionPipeline::with_services(Arc::new(db.clone()), config, services)?;
|
|
||||||
|
|
||||||
let task = reserve_task(
|
let task = reserve_task(
|
||||||
&db,
|
&db,
|
||||||
@@ -397,9 +392,7 @@ mod finalize_tests {
|
|||||||
let processing = task.mark_processing(&db).await?;
|
let processing = task.mark_processing(&db).await?;
|
||||||
|
|
||||||
db.client
|
db.client
|
||||||
.query(
|
.query("UPDATE type::thing('ingestion_task', $id) SET worker_id = $wrong_worker;")
|
||||||
"UPDATE type::thing('ingestion_task', $id) SET worker_id = $wrong_worker;",
|
|
||||||
)
|
|
||||||
.bind(("id", processing.id.clone()))
|
.bind(("id", processing.id.clone()))
|
||||||
.bind(("wrong_worker", "wrong-worker"))
|
.bind(("wrong_worker", "wrong-worker"))
|
||||||
.await?;
|
.await?;
|
||||||
@@ -410,9 +403,7 @@ mod finalize_tests {
|
|||||||
sleep(Duration::from_millis(5)).await;
|
sleep(Duration::from_millis(5)).await;
|
||||||
let _ = db_fix
|
let _ = db_fix
|
||||||
.client
|
.client
|
||||||
.query(
|
.query("UPDATE type::thing('ingestion_task', $id) SET worker_id = $worker_id;")
|
||||||
"UPDATE type::thing('ingestion_task', $id) SET worker_id = $worker_id;",
|
|
||||||
)
|
|
||||||
.bind(("id", task_id))
|
.bind(("id", task_id))
|
||||||
.bind(("worker_id", worker_id))
|
.bind(("worker_id", worker_id))
|
||||||
.await;
|
.await;
|
||||||
@@ -420,10 +411,7 @@ mod finalize_tests {
|
|||||||
|
|
||||||
pipeline.finalize_succeeded(&processing).await?;
|
pipeline.finalize_succeeded(&processing).await?;
|
||||||
|
|
||||||
let stored: IngestionTask = db
|
let stored: IngestionTask = db.get_item(&processing.id).await?.context("task stored")?;
|
||||||
.get_item(&processing.id)
|
|
||||||
.await?
|
|
||||||
.context("task stored")?;
|
|
||||||
assert_eq!(stored.state, TaskState::Succeeded);
|
assert_eq!(stored.state, TaskState::Succeeded);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
@@ -133,15 +133,15 @@ pub fn large_artifacts(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn persist(
|
pub async fn persist(db: &SurrealDbClient, artifacts: PipelineArtifacts) -> Result<(), AppError> {
|
||||||
db: &SurrealDbClient,
|
|
||||||
artifacts: PipelineArtifacts,
|
|
||||||
) -> Result<(), AppError> {
|
|
||||||
persist_artifacts(db, &tuning(), TEST_EMBEDDING_DIM, artifacts).await?;
|
persist_artifacts(db, &tuning(), TEST_EMBEDDING_DIM, artifacts).await?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn count_chunks_for_source(db: &SurrealDbClient, source_id: &str) -> anyhow::Result<usize> {
|
pub async fn count_chunks_for_source(
|
||||||
|
db: &SurrealDbClient,
|
||||||
|
source_id: &str,
|
||||||
|
) -> anyhow::Result<usize> {
|
||||||
let chunks: Vec<TextChunk> = db
|
let chunks: Vec<TextChunk> = db
|
||||||
.client
|
.client
|
||||||
.query("SELECT * FROM text_chunk WHERE source_id = $source_id;")
|
.query("SELECT * FROM text_chunk WHERE source_id = $source_id;")
|
||||||
|
|||||||
@@ -1,5 +1,15 @@
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use super::{
|
||||||
|
config::{IngestionConfig, IngestionTuning},
|
||||||
|
enrichment_result::LLMEnrichmentResult,
|
||||||
|
services::PipelineServices,
|
||||||
|
test_support::{
|
||||||
|
count_chunks_for_source, count_entities_for_source, count_relationships_for_source,
|
||||||
|
persist, sample_artifacts, setup_db,
|
||||||
|
},
|
||||||
|
IngestionPipeline,
|
||||||
|
};
|
||||||
use crate::pipeline::context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk};
|
use crate::pipeline::context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk};
|
||||||
use anyhow::{self, Context};
|
use anyhow::{self, Context};
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
@@ -20,16 +30,6 @@ use common::{
|
|||||||
};
|
};
|
||||||
use retrieval_pipeline::{RetrievedChunk, RetrievedEntity};
|
use retrieval_pipeline::{RetrievedChunk, RetrievedEntity};
|
||||||
use tokio::sync::Mutex;
|
use tokio::sync::Mutex;
|
||||||
use super::{
|
|
||||||
config::{IngestionConfig, IngestionTuning},
|
|
||||||
enrichment_result::LLMEnrichmentResult,
|
|
||||||
services::PipelineServices,
|
|
||||||
test_support::{
|
|
||||||
count_chunks_for_source, count_entities_for_source, count_relationships_for_source,
|
|
||||||
persist, sample_artifacts, setup_db,
|
|
||||||
},
|
|
||||||
IngestionPipeline,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub(crate) struct MockServices {
|
pub(crate) struct MockServices {
|
||||||
text_content: TextContent,
|
text_content: TextContent,
|
||||||
@@ -221,9 +221,7 @@ impl PipelineServices for FailingServices {
|
|||||||
content: &TextContent,
|
content: &TextContent,
|
||||||
analysis: &LLMEnrichmentResult,
|
analysis: &LLMEnrichmentResult,
|
||||||
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
|
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
|
||||||
self.inner
|
self.inner.convert_analysis(content, analysis).await
|
||||||
.convert_analysis(content, analysis)
|
|
||||||
.await
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn prepare_chunks(
|
async fn prepare_chunks(
|
||||||
|
|||||||
@@ -219,7 +219,7 @@ fn add_char_into_object(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
(&Value::Bool(true) | &Value::Bool(false), &ObjectStatus::Scalar { .. }, 'e')
|
(&Value::Bool(true) | &Value::Bool(false), &ObjectStatus::Scalar { .. }, 'e')
|
||||||
| (&Value::Object(_), &ObjectStatus::ValueQuoteClose, '}') => {
|
| (&Value::Object(_), &ObjectStatus::ValueQuoteClose, '}') => {
|
||||||
*current_status = ObjectStatus::Closed;
|
*current_status = ObjectStatus::Closed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
+1
-6
@@ -60,12 +60,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
worker_embedding,
|
worker_embedding,
|
||||||
)?);
|
)?);
|
||||||
|
|
||||||
run_worker_loop(
|
run_worker_loop(worker_db, ingestion_pipeline, index_rebuild_interval_secs).await
|
||||||
worker_db,
|
|
||||||
ingestion_pipeline,
|
|
||||||
index_rebuild_interval_secs,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
});
|
});
|
||||||
|
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
|
|||||||
+1
-3
@@ -74,9 +74,7 @@ mod tests {
|
|||||||
let db = Arc::clone(&services.db);
|
let db = Arc::clone(&services.db);
|
||||||
let pipeline = Arc::new(pipeline);
|
let pipeline = Arc::new(pipeline);
|
||||||
let worker =
|
let worker =
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move { ingestion_pipeline::run_worker_loop(db, pipeline, 0).await });
|
||||||
ingestion_pipeline::run_worker_loop(db, pipeline, 0).await
|
|
||||||
});
|
|
||||||
|
|
||||||
tokio::time::sleep(Duration::from_millis(250)).await;
|
tokio::time::sleep(Duration::from_millis(250)).await;
|
||||||
assert!(
|
assert!(
|
||||||
|
|||||||
Reference in New Issue
Block a user