mirror of
https://github.com/perstarkse/minne.git
synced 2026-04-24 17:58:31 +02:00
feat: state machine for tasks, multiple workers
This commit is contained in:
@@ -3,101 +3,47 @@ pub mod pipeline;
|
||||
pub mod types;
|
||||
pub mod utils;
|
||||
|
||||
use chrono::Utc;
|
||||
use common::storage::{
|
||||
db::SurrealDbClient,
|
||||
types::ingestion_task::{IngestionTask, IngestionTaskStatus},
|
||||
types::ingestion_task::{IngestionTask, DEFAULT_LEASE_SECS},
|
||||
};
|
||||
use futures::StreamExt;
|
||||
use pipeline::IngestionPipeline;
|
||||
use std::sync::Arc;
|
||||
use surrealdb::Action;
|
||||
use tracing::{error, info};
|
||||
use tokio::time::{sleep, Duration};
|
||||
use tracing::{error, info, warn};
|
||||
use uuid::Uuid;
|
||||
|
||||
pub async fn run_worker_loop(
|
||||
db: Arc<SurrealDbClient>,
|
||||
ingestion_pipeline: Arc<IngestionPipeline>,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
let worker_id = format!("ingestion-worker-{}", Uuid::new_v4());
|
||||
let lease_duration = Duration::from_secs(DEFAULT_LEASE_SECS as u64);
|
||||
let idle_backoff = Duration::from_millis(500);
|
||||
|
||||
loop {
|
||||
// First, check for any unfinished tasks
|
||||
let unfinished_tasks = IngestionTask::get_unfinished_tasks(&db).await?;
|
||||
if !unfinished_tasks.is_empty() {
|
||||
info!("Found {} unfinished jobs", unfinished_tasks.len());
|
||||
for task in unfinished_tasks {
|
||||
ingestion_pipeline.process_task(task).await?;
|
||||
}
|
||||
}
|
||||
|
||||
// If no unfinished jobs, start listening for new ones
|
||||
info!("Listening for new jobs...");
|
||||
let mut job_stream = IngestionTask::listen_for_tasks(&db).await?;
|
||||
while let Some(notification) = job_stream.next().await {
|
||||
match notification {
|
||||
Ok(notification) => {
|
||||
info!("Received notification: {:?}", notification);
|
||||
match notification.action {
|
||||
Action::Create => {
|
||||
if let Err(e) = ingestion_pipeline.process_task(notification.data).await
|
||||
{
|
||||
error!("Error processing task: {}", e);
|
||||
}
|
||||
}
|
||||
Action::Update => {
|
||||
match notification.data.status {
|
||||
IngestionTaskStatus::Completed
|
||||
| IngestionTaskStatus::Error { .. }
|
||||
| IngestionTaskStatus::Cancelled => {
|
||||
info!(
|
||||
"Skipping already completed/error/cancelled task: {}",
|
||||
notification.data.id
|
||||
);
|
||||
continue;
|
||||
}
|
||||
IngestionTaskStatus::InProgress { attempts, .. } => {
|
||||
// Only process if this is a retry after an error, not our own update
|
||||
if let Ok(Some(current_task)) =
|
||||
db.get_item::<IngestionTask>(¬ification.data.id).await
|
||||
{
|
||||
match current_task.status {
|
||||
IngestionTaskStatus::Error { .. }
|
||||
if attempts
|
||||
< common::storage::types::ingestion_task::MAX_ATTEMPTS =>
|
||||
{
|
||||
// This is a retry after an error
|
||||
if let Err(e) =
|
||||
ingestion_pipeline.process_task(current_task).await
|
||||
{
|
||||
error!("Error processing task retry: {}", e);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
info!(
|
||||
"Skipping in-progress update for task: {}",
|
||||
notification.data.id
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
IngestionTaskStatus::Created => {
|
||||
// Shouldn't happen with Update action, but process if it does
|
||||
if let Err(e) =
|
||||
ingestion_pipeline.process_task(notification.data).await
|
||||
{
|
||||
error!("Error processing task: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {} // Ignore other actions
|
||||
}
|
||||
match IngestionTask::claim_next_ready(&db, &worker_id, Utc::now(), lease_duration).await {
|
||||
Ok(Some(task)) => {
|
||||
let task_id = task.id.clone();
|
||||
info!(
|
||||
%worker_id,
|
||||
%task_id,
|
||||
attempt = task.attempts,
|
||||
"claimed ingestion task"
|
||||
);
|
||||
if let Err(err) = ingestion_pipeline.process_task(task).await {
|
||||
error!(%worker_id, %task_id, error = %err, "ingestion task failed");
|
||||
}
|
||||
Err(e) => error!("Error in job notification: {}", e),
|
||||
}
|
||||
Ok(None) => {
|
||||
sleep(idle_backoff).await;
|
||||
}
|
||||
Err(err) => {
|
||||
error!(%worker_id, error = %err, "failed to claim ingestion task");
|
||||
warn!("Backing off for 1s after claim error");
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
|
||||
// If we reach here, the stream has ended (connection lost?)
|
||||
error!("Database stream ended unexpectedly, reconnecting...");
|
||||
tokio::time::sleep(tokio::time::Duration::from_secs(5)).await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,16 +1,15 @@
|
||||
use std::{sync::Arc, time::Instant};
|
||||
|
||||
use chrono::Utc;
|
||||
use text_splitter::TextSplitter;
|
||||
use tokio::time::{sleep, Duration};
|
||||
use tracing::{info, warn};
|
||||
use tracing::{info, info_span, warn};
|
||||
|
||||
use common::{
|
||||
error::AppError,
|
||||
storage::{
|
||||
db::SurrealDbClient,
|
||||
types::{
|
||||
ingestion_task::{IngestionTask, IngestionTaskStatus, MAX_ATTEMPTS},
|
||||
ingestion_task::{IngestionTask, TaskErrorInfo},
|
||||
knowledge_entity::KnowledgeEntity,
|
||||
knowledge_relationship::KnowledgeRelationship,
|
||||
text_chunk::TextChunk,
|
||||
@@ -44,47 +43,81 @@ impl IngestionPipeline {
|
||||
})
|
||||
}
|
||||
pub async fn process_task(&self, task: IngestionTask) -> Result<(), AppError> {
|
||||
let current_attempts = match task.status {
|
||||
IngestionTaskStatus::InProgress { attempts, .. } => attempts + 1,
|
||||
_ => 1,
|
||||
};
|
||||
let task_id = task.id.clone();
|
||||
let attempt = task.attempts;
|
||||
let worker_label = task
|
||||
.worker_id
|
||||
.clone()
|
||||
.unwrap_or_else(|| "unknown-worker".to_string());
|
||||
let span = info_span!(
|
||||
"ingestion_task",
|
||||
%task_id,
|
||||
attempt,
|
||||
worker_id = %worker_label,
|
||||
state = %task.state.as_str()
|
||||
);
|
||||
let _enter = span.enter();
|
||||
let processing_task = task.mark_processing(&self.db).await?;
|
||||
|
||||
// Update status to InProgress with attempt count
|
||||
IngestionTask::update_status(
|
||||
&task.id,
|
||||
IngestionTaskStatus::InProgress {
|
||||
attempts: current_attempts,
|
||||
last_attempt: Utc::now(),
|
||||
},
|
||||
let text_content = to_text_content(
|
||||
processing_task.content.clone(),
|
||||
&self.db,
|
||||
&self.config,
|
||||
&self.openai_client,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let text_content =
|
||||
to_text_content(task.content, &self.db, &self.config, &self.openai_client).await?;
|
||||
|
||||
match self.process(&text_content).await {
|
||||
Ok(_) => {
|
||||
IngestionTask::update_status(&task.id, IngestionTaskStatus::Completed, &self.db)
|
||||
.await?;
|
||||
processing_task.mark_succeeded(&self.db).await?;
|
||||
info!(%task_id, attempt, "ingestion task succeeded");
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
if current_attempts >= MAX_ATTEMPTS {
|
||||
IngestionTask::update_status(
|
||||
&task.id,
|
||||
IngestionTaskStatus::Error {
|
||||
message: format!("Max attempts reached: {}", e),
|
||||
},
|
||||
&self.db,
|
||||
)
|
||||
.await?;
|
||||
Err(err) => {
|
||||
let reason = err.to_string();
|
||||
let error_info = TaskErrorInfo {
|
||||
code: None,
|
||||
message: reason.clone(),
|
||||
};
|
||||
|
||||
if processing_task.can_retry() {
|
||||
let delay = Self::retry_delay(processing_task.attempts);
|
||||
processing_task
|
||||
.mark_failed(error_info, delay, &self.db)
|
||||
.await?;
|
||||
warn!(
|
||||
%task_id,
|
||||
attempt = processing_task.attempts,
|
||||
retry_in_secs = delay.as_secs(),
|
||||
"ingestion task failed; scheduled retry"
|
||||
);
|
||||
} else {
|
||||
processing_task
|
||||
.mark_dead_letter(error_info, &self.db)
|
||||
.await?;
|
||||
warn!(
|
||||
%task_id,
|
||||
attempt = processing_task.attempts,
|
||||
"ingestion task failed; moved to dead letter queue"
|
||||
);
|
||||
}
|
||||
Err(AppError::Processing(e.to_string()))
|
||||
|
||||
Err(AppError::Processing(reason))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns how long to wait before retrying a task that is on attempt number `attempt`.
///
/// The delay is 30 seconds for the first attempt (attempt numbers 0 and 1 are treated
/// alike) and doubles with every further attempt. The doubling exponent is clamped to 5
/// and the resulting delay is capped at 15 minutes, so any attempt from the sixth onward
/// yields 900 seconds.
fn retry_delay(attempt: u32) -> Duration {
    const BASE_SECONDS: u64 = 30;
    const MAX_SECONDS: u64 = 15 * 60;
    // Largest doubling exponent; keeps the shift below far from overflow for any `attempt`.
    const MAX_EXPONENT: u32 = 5;

    let exponent = attempt.saturating_sub(1).min(MAX_EXPONENT);
    let delay = BASE_SECONDS.saturating_mul(1_u64 << exponent);

    Duration::from_secs(delay.min(MAX_SECONDS))
}
|
||||
|
||||
pub async fn process(&self, content: &TextContent) -> Result<(), AppError> {
|
||||
let now = Instant::now();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user