Mirror of https://github.com/perstarkse/minne.git (synced 2026-04-25 10:18:38 +02:00)
retrieval simplified
@@ -1,3 +1,8 @@
+#![allow(
+    clippy::missing_docs_in_private_items,
+    clippy::result_large_err
+)]
+
 pub mod pipeline;
 pub mod utils;
 
@@ -31,17 +31,8 @@ impl Default for IngestionTuning {
     }
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Default)]
 pub struct IngestionConfig {
     pub tuning: IngestionTuning,
     pub chunk_only: bool,
 }
-
-impl Default for IngestionConfig {
-    fn default() -> Self {
-        Self {
-            tuning: IngestionTuning::default(),
-            chunk_only: false,
-        }
-    }
-}
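Deleting the hand-written impl Default is safe here because #[derive(Default)] generates exactly the same code, provided every field type implements Default (the hunk shows IngestionTuning has an impl, and bool defaults to false). A minimal self-contained sketch of the equivalence, with a stand-in tuning struct:

// Sketch: the derive below expands to the same impl the commit removes,
// assuming every field type implements Default.
#[derive(Debug, Clone, Default)]
struct IngestionTuning; // stand-in for the real tuning struct

#[derive(Debug, Clone, Default)]
struct IngestionConfig {
    tuning: IngestionTuning, // IngestionTuning::default()
    chunk_only: bool,        // bool::default() == false
}

fn main() {
    let config = IngestionConfig::default();
    assert!(!config.chunk_only);
    println!("{config:?}");
}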
@@ -52,7 +52,7 @@ impl LLMEnrichmentResult {
         entity_concurrency: usize,
         embedding_provider: Option<&EmbeddingProvider>,
     ) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
-        let mapper = Arc::new(self.create_mapper()?);
+        let mapper = Arc::new(self.create_mapper());
 
         let entities = self
             .process_entities(
@@ -66,21 +66,22 @@ impl LLMEnrichmentResult {
             )
             .await?;
 
-        let relationships = self.process_relationships(source_id, user_id, Arc::clone(&mapper))?;
+        let relationships = self.process_relationships(source_id, user_id, mapper.as_ref())?;
 
         Ok((entities, relationships))
     }
 
-    fn create_mapper(&self) -> Result<GraphMapper, AppError> {
+    fn create_mapper(&self) -> GraphMapper {
         let mut mapper = GraphMapper::new();
 
         for entity in &self.knowledge_entities {
            mapper.assign_id(&entity.key);
        }
 
-        Ok(mapper)
+        mapper
     }
 
+    #[allow(clippy::too_many_arguments)]
     async fn process_entities(
         &self,
         source_id: &str,
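Two ideas in this hunk: a constructor that cannot fail should return the value itself rather than a Result (so Ok(mapper) becomes mapper and the ? disappears at the call site), and a read-only consumer should take &GraphMapper, which an Arc owner can lend out with as_ref(). A hedged sketch with a stand-in GraphMapper:

use std::{collections::HashMap, sync::Arc};

// Stand-in for the real GraphMapper: assigns stable ids to string keys.
#[derive(Default)]
struct GraphMapper {
    ids: HashMap<String, usize>,
}

impl GraphMapper {
    fn new() -> Self {
        Self::default()
    }
    fn assign_id(&mut self, key: &str) {
        let next = self.ids.len();
        self.ids.entry(key.to_string()).or_insert(next);
    }
    fn get(&self, key: &str) -> Option<usize> {
        self.ids.get(key).copied()
    }
}

// Infallible construction returns the value directly: no Result, no `?`
// at the call site, and the caller decides whether to wrap it in an Arc.
fn create_mapper(keys: &[&str]) -> GraphMapper {
    let mut mapper = GraphMapper::new();
    for key in keys {
        mapper.assign_id(key);
    }
    mapper
}

// Read-only consumers take &GraphMapper; an Arc owner lends it via as_ref().
fn lookup(mapper: &GraphMapper, key: &str) -> Option<usize> {
    mapper.get(key)
}

fn main() {
    let mapper = Arc::new(create_mapper(&["a", "b"]));
    assert_eq!(lookup(mapper.as_ref(), "b"), Some(1));
}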
@@ -91,7 +92,7 @@ impl LLMEnrichmentResult {
         entity_concurrency: usize,
         embedding_provider: Option<&EmbeddingProvider>,
     ) -> Result<Vec<EmbeddedKnowledgeEntity>, AppError> {
-        stream::iter(self.knowledge_entities.iter().cloned().map(|entity| {
+        stream::iter(self.knowledge_entities.clone().into_iter().map(|entity| {
             let mapper = Arc::clone(&mapper);
             let openai_client = openai_client.clone();
             let source_id = source_id.to_string();
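The switch from iter().cloned() to clone().into_iter() clones the Vec once up front instead of cloning element by element as the stream is polled; either way each spawned future ends up owning its entity, which is what lets the futures run concurrently without borrowing self. A hedged sketch of the pattern (names hypothetical, buffer_unordered standing in for the pipeline's entity_concurrency bound):

use futures::stream::{self, StreamExt};

#[derive(Clone, Debug)]
struct Entity {
    key: String,
}

async fn process(entity: Entity) -> String {
    // Placeholder for the real embedding/LLM work.
    entity.key.to_uppercase()
}

#[tokio::main]
async fn main() {
    let entities = vec![
        Entity { key: "a".into() },
        Entity { key: "b".into() },
    ];

    // Cloning the Vec and consuming it with into_iter() yields owned
    // items, so each future is 'static and can run concurrently.
    let results: Vec<String> = stream::iter(entities.clone().into_iter().map(process))
        .buffer_unordered(4) // bounded concurrency
        .collect()
        .await;

    assert_eq!(results.len(), 2);
}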
@@ -120,7 +121,7 @@ impl LLMEnrichmentResult {
         &self,
         source_id: &str,
         user_id: &str,
-        mapper: Arc<GraphMapper>,
+        mapper: &GraphMapper,
     ) -> Result<Vec<KnowledgeRelationship>, AppError> {
         self.relationships
             .iter()
@@ -170,9 +171,9 @@ async fn create_single_entity(
         id: assigned_id,
         created_at: now,
         updated_at: now,
-        name: llm_entity.name.to_string(),
-        description: llm_entity.description.to_string(),
-        entity_type: KnowledgeEntityType::from(llm_entity.entity_type.to_string()),
+        name: llm_entity.name.clone(),
+        description: llm_entity.description.clone(),
+        entity_type: KnowledgeEntityType::from(llm_entity.entity_type.clone()),
         source_id: source_id.to_string(),
         metadata: None,
         user_id: user_id.into(),
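Assuming name and description are already owned Strings, .clone() says what actually happens (a buffer copy), whereas .to_string() reads as a conversion and, absent the standard library's specializations, routes through the Display machinery; clippy's string_to_string lint makes the same argument. Trivially:

fn main() {
    let name = String::from("entity");
    let a = name.clone();     // plain buffer copy, intent is obvious
    let b = name.to_string(); // same result via ToString; clippy prefers clone()
    assert_eq!(a, b);
}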
@@ -8,6 +8,7 @@ mod state;
 
 pub use config::{IngestionConfig, IngestionTuning};
 pub use enrichment_result::{LLMEnrichmentResult, LLMKnowledgeEntity, LLMRelationship};
+#[allow(clippy::module_name_repetitions)]
 pub use services::{DefaultPipelineServices, PipelineServices};
 
 use std::{
@@ -37,6 +38,7 @@ use self::{
     state::ready,
 };
 
+#[allow(clippy::module_name_repetitions)]
 pub struct IngestionPipeline {
     db: Arc<SurrealDbClient>,
     pipeline_config: IngestionConfig,
@@ -44,7 +46,7 @@ pub struct IngestionPipeline {
 }
 
 impl IngestionPipeline {
-    pub async fn new(
+    pub fn new(
         db: Arc<SurrealDbClient>,
         openai_client: Arc<Client<async_openai::config::OpenAIConfig>>,
         config: AppConfig,
@@ -61,10 +63,9 @@ impl IngestionPipeline {
             embedding_provider,
             IngestionConfig::default(),
         )
-        .await
     }
 
-    pub async fn new_with_config(
+    pub fn new_with_config(
         db: Arc<SurrealDbClient>,
         openai_client: Arc<Client<async_openai::config::OpenAIConfig>>,
         config: AppConfig,
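With the .await removed from new's body, neither constructor does any asynchronous work, so keeping them async would only force callers to await a future that resolves immediately (clippy's unused_async). Minimal sketch:

struct Pipeline {
    label: String,
}

impl Pipeline {
    // Before: `pub async fn new(..)` made callers write
    // `Pipeline::new(..).await` for no benefit.
    fn new(label: impl Into<String>) -> Self {
        Self { label: label.into() }
    }
}

fn main() {
    let pipeline = Pipeline::new("ingestion"); // no .await needed
    assert_eq!(pipeline.label, "ingestion");
}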
@@ -74,9 +75,9 @@ impl IngestionPipeline {
         pipeline_config: IngestionConfig,
     ) -> Result<Self, AppError> {
         let services = DefaultPipelineServices::new(
-            db.clone(),
-            openai_client.clone(),
-            config.clone(),
+            Arc::clone(&db),
+            openai_client,
+            config,
             reranker_pool,
             storage,
             embedding_provider,
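Arc::clone(&db) and db.clone() compile to the same reference-count bump; the fully qualified form (clippy's clone_on_ref_ptr) signals that only the pointer is duplicated, not the SurrealDbClient behind it. openai_client and config, by contrast, are now moved because this call is their last use. Illustration:

use std::sync::Arc;

fn main() {
    let db = Arc::new(String::from("shared client"));
    let for_services = Arc::clone(&db); // explicitly a pointer clone
    assert_eq!(Arc::strong_count(&db), 2);
    drop(for_services);
    assert_eq!(Arc::strong_count(&db), 1);
}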
@@ -181,11 +182,17 @@ impl IngestionPipeline {
             .saturating_sub(1)
             .min(tuning.retry_backoff_cap_exponent);
         let multiplier = 2_u64.pow(capped_attempt);
-        let delay = tuning.retry_base_delay_secs * multiplier;
+        let delay = tuning
+            .retry_base_delay_secs
+            .saturating_mul(multiplier);
 
         Duration::from_secs(delay.min(tuning.retry_max_delay_secs))
     }
 
+    fn duration_millis(duration: Duration) -> u64 {
+        u64::try_from(duration.as_millis()).unwrap_or(u64::MAX)
+    }
+
     #[tracing::instrument(
         skip_all,
         fields(task_id = %task.id, attempt = task.attempts, user_id = %task.user_id)
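Both changes trade silent wraparound for saturation: with a large enough attempt count, base * 2^n can overflow u64, and Duration::as_millis() returns a u128 that an `as u64` cast would silently truncate. A self-contained sketch of the two helpers under those assumptions (the Tuning struct mirrors the field names in the diff):

use std::time::Duration;

struct Tuning {
    retry_base_delay_secs: u64,
    retry_backoff_cap_exponent: u32,
    retry_max_delay_secs: u64,
}

// Exponential backoff that cannot overflow: the exponent is capped, the
// multiplication saturates, and the result is clamped to a maximum.
fn retry_delay(tuning: &Tuning, attempt: u32) -> Duration {
    let capped_attempt = attempt
        .saturating_sub(1)
        .min(tuning.retry_backoff_cap_exponent);
    let multiplier = 2_u64.pow(capped_attempt);
    let delay = tuning.retry_base_delay_secs.saturating_mul(multiplier);
    Duration::from_secs(delay.min(tuning.retry_max_delay_secs))
}

// u128 -> u64 pins to u64::MAX instead of truncating like `as u64` would.
fn duration_millis(duration: Duration) -> u64 {
    u64::try_from(duration.as_millis()).unwrap_or(u64::MAX)
}

fn main() {
    let tuning = Tuning {
        retry_base_delay_secs: 2,
        retry_backoff_cap_exponent: 5,
        retry_max_delay_secs: 60,
    };
    assert_eq!(retry_delay(&tuning, 3), Duration::from_secs(8));
    assert_eq!(duration_millis(Duration::from_secs(1)), 1_000);
}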
@@ -231,14 +238,14 @@ impl IngestionPipeline {
         let persist_duration = stage_start.elapsed();
 
         let total_duration = pipeline_started.elapsed();
-        let prepare_ms = prepare_duration.as_millis() as u64;
-        let retrieve_ms = retrieve_duration.as_millis() as u64;
-        let enrich_ms = enrich_duration.as_millis() as u64;
-        let persist_ms = persist_duration.as_millis() as u64;
+        let prepare_ms = Self::duration_millis(prepare_duration);
+        let retrieve_ms = Self::duration_millis(retrieve_duration);
+        let enrich_ms = Self::duration_millis(enrich_duration);
+        let persist_ms = Self::duration_millis(persist_duration);
         info!(
             task_id = %ctx.task_id,
             attempt = ctx.attempt,
-            total_ms = total_duration.as_millis() as u64,
+            total_ms = Self::duration_millis(total_duration),
             prepare_ms,
             retrieve_ms,
             enrich_ms,
@@ -228,7 +228,7 @@ impl PipelineServices for DefaultPipelineServices {
     ) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
         analysis
             .to_database_entities(
-                &content.get_id(),
+                content.get_id(),
                 &content.user_id,
                 &self.openai_client,
                 &self.db,
@@ -327,13 +327,13 @@ fn truncate_for_embedding(text: &str, max_chars: usize) -> String {
         return text.to_string();
     }
 
-    let mut truncated = String::with_capacity(max_chars + 3);
+    let mut truncated = String::with_capacity(max_chars.saturating_add(3));
     for (idx, ch) in text.chars().enumerate() {
        if idx >= max_chars {
            break;
        }
        truncated.push(ch);
    }
-    truncated.push_str("…");
+    truncated.push('…');
     truncated
 }
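The two fixes are small but principled: push('…') appends one char without the &str detour of push_str("…"), and the whole routine works in chars so a multi-byte UTF-8 code point is never split. An equivalent hedged sketch using iterator adapters (the early-return guard is assumed; only its return line appears in the hunk):

// Equivalent sketch of truncate_for_embedding: char-aware, so UTF-8
// code points are never split mid-sequence.
fn truncate_for_embedding(text: &str, max_chars: usize) -> String {
    if text.chars().count() <= max_chars {
        return text.to_string();
    }
    let mut truncated: String = text.chars().take(max_chars).collect();
    truncated.push('…'); // one char, vs. push_str("…") for a &str
    truncated
}

fn main() {
    assert_eq!(truncate_for_embedding("héllo", 2), "hé…");
    assert_eq!(truncate_for_embedding("hi", 10), "hi");
}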
@@ -20,6 +20,22 @@ use super::{
     state::{ContentPrepared, Enriched, IngestionMachine, Persisted, Ready, Retrieved},
 };
 
+const STORE_RELATIONSHIPS: &str = r"
+BEGIN TRANSACTION;
+LET $relationships = $relationships;
+
+FOR $relationship IN $relationships {
+    LET $in_node = type::thing('knowledge_entity', $relationship.in);
+    LET $out_node = type::thing('knowledge_entity', $relationship.out);
+    RELATE $in_node->relates_to->$out_node CONTENT {
+        id: type::thing('relates_to', $relationship.id),
+        metadata: $relationship.metadata
+    };
+};
+
+COMMIT TRANSACTION;
+";
+
 #[instrument(
     level = "trace",
     skip_all,
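Hoisting the SurrealQL into a module-level const gives the stage functions (and any tests) a single copy of the statement; a later hunk binds it with one relationships array that the query's FOR loop walks inside a transaction. A hedged sketch of running such a query with the surrealdb crate; the in-memory engine (feature kv-mem) and the row shape are assumptions, not the project's actual types:

use serde::Serialize;
use surrealdb::engine::local::Mem;
use surrealdb::Surreal;

const STORE_RELATIONSHIPS: &str = r"
BEGIN TRANSACTION;
LET $relationships = $relationships;

FOR $relationship IN $relationships {
    LET $in_node = type::thing('knowledge_entity', $relationship.in);
    LET $out_node = type::thing('knowledge_entity', $relationship.out);
    RELATE $in_node->relates_to->$out_node CONTENT {
        id: type::thing('relates_to', $relationship.id),
        metadata: $relationship.metadata
    };
};

COMMIT TRANSACTION;
";

// Hypothetical row shape matching what the query dereferences.
#[derive(Serialize, Clone)]
struct RelationshipRow {
    id: String,
    r#in: String, // serialized as "in"
    out: String,
    metadata: serde_json::Value,
}

#[tokio::main]
async fn main() -> surrealdb::Result<()> {
    let db = Surreal::new::<Mem>(()).await?;
    db.use_ns("test").use_db("test").await?;

    let rels = vec![RelationshipRow {
        id: "r1".into(),
        r#in: "e1".into(),
        out: "e2".into(),
        metadata: serde_json::json!({ "source": "demo" }),
    }];

    // One bind; the FOR loop inside the query walks the whole array.
    db.query(STORE_RELATIONSHIPS)
        .bind(("relationships", rels))
        .await?
        .check()?;
    Ok(())
}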
@@ -40,8 +56,7 @@ pub async fn prepare_content(
     let context_len = text_content
         .context
         .as_ref()
-        .map(|c| c.chars().count())
-        .unwrap_or(0);
+        .map_or(0, |c| c.chars().count());
 
     tracing::info!(
         task_id = %ctx.task_id,
@@ -65,7 +80,7 @@
 
     machine
         .prepare()
-        .map_err(|(_, guard)| map_guard_error("prepare", guard))
+        .map_err(|(_, guard)| map_guard_error("prepare", &guard))
 }
 
 #[instrument(
@@ -80,7 +95,7 @@ pub async fn retrieve_related(
     if ctx.pipeline_config.chunk_only {
         return machine
             .retrieve()
-            .map_err(|(_, guard)| map_guard_error("retrieve", guard));
+            .map_err(|(_, guard)| map_guard_error("retrieve", &guard));
     }
 
     let content = ctx.text_content()?;
@@ -97,7 +112,7 @@
 
     machine
         .retrieve()
-        .map_err(|(_, guard)| map_guard_error("retrieve", guard))
+        .map_err(|(_, guard)| map_guard_error("retrieve", &guard))
 }
 
 #[instrument(
@@ -116,7 +131,7 @@ pub async fn enrich(
         });
         return machine
             .enrich()
-            .map_err(|(_, guard)| map_guard_error("enrich", guard));
+            .map_err(|(_, guard)| map_guard_error("enrich", &guard));
     }
 
     let content = ctx.text_content()?;
@@ -137,7 +152,7 @@
 
     machine
         .enrich()
-        .map_err(|(_, guard)| map_guard_error("enrich", guard))
+        .map_err(|(_, guard)| map_guard_error("enrich", &guard))
 }
 
 #[instrument(
@@ -182,10 +197,10 @@ pub async fn persist(
 
     machine
         .persist()
-        .map_err(|(_, guard)| map_guard_error("persist", guard))
+        .map_err(|(_, guard)| map_guard_error("persist", &guard))
 }
 
-fn map_guard_error(event: &str, guard: GuardError) -> AppError {
+fn map_guard_error(event: &str, guard: &GuardError) -> AppError {
     AppError::InternalError(format!(
         "invalid ingestion pipeline transition during {event}: {guard:?}"
     ))
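map_guard_error only Debug-formats the guard, so taking &GuardError instead of GuardError follows clippy's needless_pass_by_value: a borrow is enough and the caller keeps ownership. With a stand-in error type:

#[derive(Debug)]
struct GuardError(&'static str);

// A borrow suffices: Debug-formatting never needs ownership.
fn map_guard_error(event: &str, guard: &GuardError) -> String {
    format!("invalid ingestion pipeline transition during {event}: {guard:?}")
}

fn main() {
    let guard = GuardError("state not ready");
    println!("{}", map_guard_error("persist", &guard));
    // `guard` is still usable here because it was only lent out.
    let _ = guard;
}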
@@ -206,43 +221,31 @@ async fn store_graph_entities(
         return Ok(());
     }
 
-    const STORE_RELATIONSHIPS: &str = r"
-    BEGIN TRANSACTION;
-    LET $relationships = $relationships;
-
-    FOR $relationship IN $relationships {
-        LET $in_node = type::thing('knowledge_entity', $relationship.in);
-        LET $out_node = type::thing('knowledge_entity', $relationship.out);
-        RELATE $in_node->relates_to->$out_node CONTENT {
-            id: type::thing('relates_to', $relationship.id),
-            metadata: $relationship.metadata
-        };
-    };
-
-    COMMIT TRANSACTION;
-    ";
-
     let relationships = Arc::new(relationships);
 
     let mut backoff_ms = tuning.graph_initial_backoff_ms;
+    let last_attempt = tuning.graph_store_attempts.saturating_sub(1);
 
     for attempt in 0..tuning.graph_store_attempts {
         let result = db
             .client
             .query(STORE_RELATIONSHIPS)
-            .bind(("relationships", relationships.clone()))
+            .bind(("relationships", Arc::clone(&relationships)))
             .await;
 
         match result {
             Ok(_) => return Ok(()),
             Err(err) => {
-                if is_retryable_conflict(&err) && attempt + 1 < tuning.graph_store_attempts {
+                if is_retryable_conflict(&err) && attempt < last_attempt {
+                    let next_attempt = attempt.saturating_add(1);
                     warn!(
-                        attempt = attempt + 1,
+                        attempt = next_attempt,
                         "Transient SurrealDB conflict while storing graph data; retrying"
                     );
                     sleep(Duration::from_millis(backoff_ms)).await;
-                    backoff_ms = (backoff_ms * 2).min(tuning.graph_max_backoff_ms);
+                    backoff_ms = backoff_ms
+                        .saturating_mul(2)
+                        .min(tuning.graph_max_backoff_ms);
                     continue;
                 }
 
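The loop's shape generalizes: compute the final attempt index once, retry only errors the caller deems transient, and double the backoff with saturating arithmetic so it can never wrap around to a tiny delay. A hedged, generic sketch of that shape (not the project's actual helper):

use std::future::Future;
use std::time::Duration;
use tokio::time::sleep;

async fn with_retries<F, Fut, T, E>(
    attempts: u32,
    initial_backoff_ms: u64,
    max_backoff_ms: u64,
    is_retryable: impl Fn(&E) -> bool,
    mut op: F,
) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, E>>,
{
    let mut backoff_ms = initial_backoff_ms;
    let last_attempt = attempts.saturating_sub(1);
    for attempt in 0..attempts {
        match op().await {
            Ok(value) => return Ok(value),
            // Retry only transient failures, and never after the last attempt.
            Err(err) if is_retryable(&err) && attempt < last_attempt => {
                sleep(Duration::from_millis(backoff_ms)).await;
                backoff_ms = backoff_ms.saturating_mul(2).min(max_backoff_ms);
            }
            Err(err) => return Err(err),
        }
    }
    unreachable!("attempts must be > 0")
}

#[tokio::main]
async fn main() {
    let mut calls = 0;
    let result: Result<u32, &str> = with_retries(
        3,
        10,
        100,
        |err| *err == "transient",
        || {
            calls += 1;
            let call = calls;
            async move {
                if call < 3 {
                    Err("transient")
                } else {
                    Ok(42)
                }
            }
        },
    )
    .await;
    assert_eq!(result, Ok(42));
}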
@@ -65,7 +65,7 @@ fn infer_extension(file_info: &FileInfo) -> Option<String> {
     Path::new(&file_info.path)
         .extension()
         .and_then(|ext| ext.to_str())
-        .map(|ext| ext.to_string())
+        .map(std::string::ToString::to_string)
 }
 
 pub async fn extract_text_from_file(
@@ -116,6 +116,7 @@ async fn load_page_numbers(pdf_bytes: Vec<u8>) -> Result<Vec<u32>, AppError> {
 }
 
 /// Uses the existing headless Chrome dependency to rasterize the requested PDF pages into PNGs.
+#[allow(clippy::too_many_lines)]
 async fn render_pdf_pages(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>>, AppError> {
     let file_url = url::Url::from_file_path(file_path)
         .map_err(|()| AppError::Processing("Unable to construct PDF file URL".into()))?;
@@ -148,7 +149,7 @@ async fn render_pdf_pages(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>>, AppError> {
             loaded = true;
             break;
         }
-        if attempt + 1 < NAVIGATION_RETRY_ATTEMPTS {
+        if attempt < NAVIGATION_RETRY_ATTEMPTS.saturating_sub(1) {
             sleep(Duration::from_millis(NAVIGATION_RETRY_INTERVAL_MS)).await;
         }
     }
@@ -172,7 +173,7 @@ async fn render_pdf_pages(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>>, AppError> {
                 break;
             }
             Ok(None) => {
-                if attempt + 1 < CANVAS_VIEWPORT_ATTEMPTS {
+                if attempt < CANVAS_VIEWPORT_ATTEMPTS.saturating_sub(1) {
                     tokio::time::sleep(Duration::from_millis(CANVAS_VIEWPORT_WAIT_MS)).await;
                 }
             }
@@ -260,6 +261,7 @@ fn create_browser() -> Result<Browser, AppError> {
 }
 
 /// Sends one or more rendered pages to the configured multimodal model and stitches the resulting Markdown chunks together.
+#[allow(clippy::too_many_lines)]
 async fn vision_markdown(
     rendered_pages: Vec<Vec<u8>>,
     db: &SurrealDbClient,
@@ -303,10 +305,11 @@ async fn vision_markdown(
 
     let mut batch_markdown: Option<String> = None;
 
+    let last_attempt = MAX_VISION_ATTEMPTS.saturating_sub(1);
     for attempt in 0..MAX_VISION_ATTEMPTS {
         let prompt_text = prompt_for_attempt(attempt, prompt);
 
-        let mut content_parts = Vec::with_capacity(encoded_images.len() + 1);
+        let mut content_parts = Vec::with_capacity(encoded_images.len().saturating_add(1));
         content_parts.push(
             ChatCompletionRequestMessageContentPartTextArgs::default()
                 .text(prompt_text)
@@ -375,7 +378,7 @@ async fn vision_markdown(
                 batch = batch_idx,
                 attempt, "Vision model returned low quality response"
             );
-            if attempt + 1 == MAX_VISION_ATTEMPTS {
+            if attempt == last_attempt {
                 return Err(AppError::Processing(
                     "Vision model failed to transcribe PDF page contents".into(),
                 ));
@@ -400,6 +403,7 @@ async fn vision_markdown(
 }
 
 /// Heuristic that determines whether the fast-path text looks like well-formed prose.
+#[allow(clippy::cast_precision_loss)]
 fn looks_good_enough(text: &str) -> bool {
     if text.len() < FAST_PATH_MIN_LEN {
         return false;
@@ -50,7 +50,7 @@ pub async fn extract_text_from_url(
     )?;
 
     let mut tmp_file = NamedTempFile::new()?;
-    let temp_path_str = format!("{:?}", tmp_file.path());
+    let temp_path_str = tmp_file.path().display().to_string();
 
     tmp_file.write_all(&screenshot)?;
     tmp_file.as_file().sync_all()?;
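format!("{:?}", path) produces the Debug rendering, which wraps the path in quotes (and escapes backslashes on Windows), while Path::display() is the adapter intended for human-readable output. Concretely:

use std::path::Path;

fn main() {
    let path = Path::new("/tmp/shot.png");
    assert_eq!(format!("{:?}", path), "\"/tmp/shot.png\""); // quoted Debug form
    assert_eq!(path.display().to_string(), "/tmp/shot.png"); // clean Display form
}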
@@ -108,14 +108,11 @@ fn ensure_ingestion_url_allowed(url: &url::Url) -> Result<String, AppError> {
         }
     }
 
-    let host = match url.host_str() {
-        Some(host) => host,
-        None => {
-            warn!(%url, "Rejected ingestion URL missing host");
-            return Err(AppError::Validation(
-                "URL is missing a host component".to_string(),
-            ));
-        }
-    };
+    let Some(host) = url.host_str() else {
+        warn!(%url, "Rejected ingestion URL missing host");
+        return Err(AppError::Validation(
+            "URL is missing a host component".to_string(),
+        ));
+    };
 
     if host.eq_ignore_ascii_case("localhost") {
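The match collapses into a let-else (stable since Rust 1.65): the success arm binds host in the enclosing scope and the else block must diverge, which exactly matches a validate-or-bail function. A small sketch using the url crate as in the diff:

fn host_or_reject(url: &url::Url) -> Result<String, String> {
    // let-else: bind on Some, diverge on None.
    let Some(host) = url.host_str() else {
        return Err("URL is missing a host component".to_string());
    };
    Ok(host.to_string())
}

fn main() {
    let ok = url::Url::parse("https://example.com/page").unwrap();
    assert_eq!(host_or_reject(&ok).unwrap(), "example.com");

    let no_host = url::Url::parse("data:text/plain,hi").unwrap();
    assert!(host_or_reject(&no_host).is_err());
}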