ingestion-pipeline crated init, begun moving

2026-04-25 02:08:30 +02:00 · 2025-03-06 15:29:13 +01:00
parent ef1478547e
commit 1a641db503
21 changed files with 648 additions and 601 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1071,7 +1071,6 @@ dependencies = [
 "mockall",
 "plotly",
 "reqwest",
 "scraper",
 "serde",
 "serde_json",
 "sha2",
@@ -1079,7 +1078,6 @@ dependencies = [
 "tempfile",
 "text-splitter",
 "thiserror",
 "tiktoken-rs",
 "tokio",
 "tower-http",
 "tracing",
@@ -2429,6 +2427,24 @@ dependencies = [
 "serde",
 ]
 [[package]]
 name = "ingestion-pipeline"
 version = "0.1.0"
 dependencies = [
 "async-openai",
 "axum",
 "chrono",
 "common",
 "reqwest",
 "scraper",
 "serde",
 "serde_json",
 "text-splitter",
 "tiktoken-rs",
 "tokio",
 "tracing",
 ]
 [[package]]
 name = "inotify"
 version = "0.10.2"
@@ -2734,6 +2750,7 @@ dependencies = [
 "config",
 "futures",
 "html-router",
 "ingestion-pipeline",
 "json-stream-parser",
 "lettre",
 "mime",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ members = [
  "crates/main",
  "crates/common",
  "crates/api-router"
-, "crates/html-router"]
+, "crates/html-router", "crates/ingestion-pipeline"]
 resolver = "2"
 [workspace.dependencies]
--- a/crates/api-router/src/lib.rs
+++ b/crates/api-router/src/lib.rs
@@ -6,7 +6,7 @@ use axum::{
    Router,
 };
 use middleware_api_auth::api_auth;
-use routes::ingress::ingress_data;
+use routes::ingress::ingest_data;
 pub mod api_state;
 mod middleware_api_auth;
@@ -19,7 +19,7 @@ where
    ApiState: FromRef<S>,
 {
    Router::new()
-        .route("/ingress", post(ingress_data))
+        .route("/ingress", post(ingest_data))
        .layer(DefaultBodyLimit::max(1024 * 1024 * 1024))
        .route_layer(from_fn_with_state(app_state.clone(), api_auth))
 }
--- a/crates/api-router/src/routes/ingress.rs
+++ b/crates/api-router/src/routes/ingress.rs
@@ -2,17 +2,19 @@ use axum::{extract::State, http::StatusCode, response::IntoResponse, Extension};
 use axum_typed_multipart::{FieldData, TryFromMultipart, TypedMultipart};
 use common::{
    error::{ApiError, AppError},
-    ingress::ingress_object::IngressObject,
+    storage::types::{
-    storage::types::{file_info::FileInfo, job::Job, user::User},
+        file_info::FileInfo, ingestion_payload::IngestionPayload, ingestion_task::IngestionTask,
        user::User,
    },
 };
 use futures::{future::try_join_all, TryFutureExt};
 use tempfile::NamedTempFile;
-use tracing::{debug, info};
+use tracing::info;
 use crate::api_state::ApiState;
 #[derive(Debug, TryFromMultipart)]
-pub struct IngressParams {
+pub struct IngestParams {
    pub content: Option<String>,
    pub instructions: String,
    pub category: String,
@@ -21,10 +23,10 @@ pub struct IngressParams {
    pub files: Vec<FieldData<NamedTempFile>>,
 }
-pub async fn ingress_data(
+pub async fn ingest_data(
    State(state): State<ApiState>,
    Extension(user): Extension<User>,
-    TypedMultipart(input): TypedMultipart<IngressParams>,
+    TypedMultipart(input): TypedMultipart<IngestParams>,
 ) -> Result<impl IntoResponse, ApiError> {
    info!("Received input: {:?}", input);
@@ -36,20 +38,19 @@ pub async fn ingress_data(
    )
    .await?;
-    debug!("Got file infos");
+    let payloads = IngestionPayload::create_ingestion_payload(
    let ingress_objects = IngressObject::create_ingress_objects(
        input.content,
        input.instructions,
        input.category,
        file_infos,
        user.id.as_str(),
    )?;
    debug!("Got ingress objects");
-    let futures: Vec<_> = ingress_objects
+    let futures: Vec<_> = payloads
        .into_iter()
-        .map(|object| Job::create_and_add_to_db(object.clone(), user.id.clone(), &state.db))
+        .map(|object| {
            IngestionTask::create_and_add_to_db(object.clone(), user.id.clone(), &state.db)
        })
        .collect();
    try_join_all(futures).await.map_err(AppError::from)?;
--- a/crates/common/Cargo.toml
+++ b/crates/common/Cargo.toml
@@ -34,12 +34,10 @@ minijinja-contrib = { version = "2.6.0", features = ["datetime", "timezone"] }
 mockall = "0.13.0"
 plotly = "0.12.1"
 reqwest = {version = "0.12.12", features = ["charset", "json"]}
 scraper = "0.22.0"
 sha2 = "0.10.8"
 surrealdb = "2.0.4"
 tempfile = "3.12.0"
 text-splitter = "0.18.1"
 tiktoken-rs = "0.6.0"
 tower-http = { version = "0.6.2", features = ["fs"] }
 tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
 url = { version = "2.5.2", features = ["serde"] }
--- a/crates/common/src/ingress/content_processor.rs
+++ b/crates/common/src/ingress/content_processor.rs
@@ -1,162 +1 @@
 use std::{sync::Arc, time::Instant};
 use chrono::Utc;
 use text_splitter::TextSplitter;
 use tracing::{debug, info};
 use crate::{
    error::AppError,
    storage::{
        db::SurrealDbClient,
        types::{
            job::{Job, JobStatus, MAX_ATTEMPTS},
            knowledge_entity::KnowledgeEntity,
            knowledge_relationship::KnowledgeRelationship,
            text_chunk::TextChunk,
            text_content::TextContent,
        },
    },
    utils::embedding::generate_embedding,
 };
 use super::analysis::{
    ingress_analyser::IngressAnalyzer, types::llm_analysis_result::LLMGraphAnalysisResult,
 };
 pub struct ContentProcessor {
    db: Arc<SurrealDbClient>,
    openai_client: Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
 }
 impl ContentProcessor {
    pub async fn new(
        db: Arc<SurrealDbClient>,
        openai_client: Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
    ) -> Result<Self, AppError> {
        Ok(Self { db, openai_client })
    }
    pub async fn process_job(&self, job: Job) -> Result<(), AppError> {
        let current_attempts = match job.status {
            JobStatus::InProgress { attempts, .. } => attempts + 1,
            _ => 1,
        };
        // Update status to InProgress with attempt count
        Job::update_status(
            &job.id,
            JobStatus::InProgress {
                attempts: current_attempts,
                last_attempt: Utc::now(),
            },
            &self.db,
        )
        .await?;
        let text_content = job.content.to_text_content(&self.openai_client).await?;
        match self.process(&text_content).await {
            Ok(_) => {
                Job::update_status(&job.id, JobStatus::Completed, &self.db).await?;
                Ok(())
            }
            Err(e) => {
                if current_attempts >= MAX_ATTEMPTS {
                    Job::update_status(
                        &job.id,
                        JobStatus::Error(format!("Max attempts reached: {}", e)),
                        &self.db,
                    )
                    .await?;
                }
                Err(AppError::Processing(e.to_string()))
            }
        }
    }
    pub async fn process(&self, content: &TextContent) -> Result<(), AppError> {
        let now = Instant::now();
        // Perform analyis, this step also includes retrieval
        let analysis = self.perform_semantic_analysis(content).await?;
        let end = now.elapsed();
        info!(
            "{:?} time elapsed during creation of entities and relationships",
            end
        );
        // Convert analysis to objects
        let (entities, relationships) = analysis
            .to_database_entities(&content.id, &content.user_id, &self.openai_client)
            .await?;
        // Store everything
        tokio::try_join!(
            self.store_graph_entities(entities, relationships),
            self.store_vector_chunks(content),
        )?;
        // Store original content
        self.db.store_item(content.to_owned()).await?;
        self.db.rebuild_indexes().await?;
        Ok(())
    }
    async fn perform_semantic_analysis(
        &self,
        content: &TextContent,
    ) -> Result<LLMGraphAnalysisResult, AppError> {
        let analyser = IngressAnalyzer::new(&self.db, &self.openai_client);
        analyser
            .analyze_content(
                &content.category,
                &content.instructions,
                &content.text,
                &content.user_id,
            )
            .await
    }
    async fn store_graph_entities(
        &self,
        entities: Vec<KnowledgeEntity>,
        relationships: Vec<KnowledgeRelationship>,
    ) -> Result<(), AppError> {
        for entity in &entities {
            debug!("Storing entity: {:?}", entity);
            self.db.store_item(entity.clone()).await?;
        }
        for relationship in &relationships {
            debug!("Storing relationship: {:?}", relationship);
            relationship.store_relationship(&self.db).await?;
        }
        info!(
            "Stored {} entities and {} relationships",
            entities.len(),
            relationships.len()
        );
        Ok(())
    }
    async fn store_vector_chunks(&self, content: &TextContent) -> Result<(), AppError> {
        let splitter = TextSplitter::new(500..2000);
        let chunks = splitter.chunks(&content.text);
        // Could potentially process chunks in parallel with a bounded concurrent limit
        for chunk in chunks {
            let embedding = generate_embedding(&self.openai_client, chunk).await?;
            let text_chunk = TextChunk::new(
                content.id.to_string(),
                chunk.to_string(),
                embedding,
                content.user_id.to_string(),
            );
            self.db.store_item(text_chunk).await?;
        }
        Ok(())
    }
 }
--- a/crates/common/src/ingress/ingress_object.rs
+++ b/crates/common/src/ingress/ingress_object.rs
@@ -1,345 +0,0 @@
 use std::{sync::Arc, time::Duration};
 use crate::{
    error::AppError,
    storage::types::{file_info::FileInfo, text_content::TextContent},
 };
 use async_openai::types::{
    ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage,
    CreateChatCompletionRequestArgs,
 };
 use reqwest;
 use scraper::{Html, Selector};
 use serde::{Deserialize, Serialize};
 use std::fmt::Write;
 use tiktoken_rs::{o200k_base, CoreBPE};
 use tracing::info;
 use url::Url;
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub enum IngressObject {
    Url {
        url: String,
        instructions: String,
        category: String,
        user_id: String,
    },
    Text {
        text: String,
        instructions: String,
        category: String,
        user_id: String,
    },
    File {
        file_info: FileInfo,
        instructions: String,
        category: String,
        user_id: String,
    },
 }
 impl IngressObject {
    /// Creates ingress objects from the provided content, instructions, and files.
    ///
    /// # Arguments
    /// * `content` - Optional textual content to be ingressed
    /// * `instructions` - Instructions for processing the ingress content
    /// * `category` - Category to classify the ingressed content
    /// * `files` - Vector of `FileInfo` objects containing information about uploaded files
    /// * `user_id` - Identifier of the user performing the ingress operation
    ///
    /// # Returns
    /// * `Result<Vec<IngressObject>, AppError>` - On success, returns a vector of ingress objects
    ///   (one per file/content type). On failure, returns an `AppError`.
    pub fn create_ingress_objects(
        content: Option<String>,
        instructions: String,
        category: String,
        files: Vec<FileInfo>,
        user_id: &str,
    ) -> Result<Vec<IngressObject>, AppError> {
        // Initialize list
        let mut object_list = Vec::new();
        // Create a IngressObject from content if it exists, checking for URL or text
        if let Some(input_content) = content {
            match Url::parse(&input_content) {
                Ok(url) => {
                    info!("Detected URL: {}", url);
                    object_list.push(IngressObject::Url {
                        url: url.to_string(),
                        instructions: instructions.clone(),
                        category: category.clone(),
                        user_id: user_id.into(),
                    });
                }
                Err(_) => {
                    if input_content.len() > 2 {
                        info!("Treating input as plain text");
                        object_list.push(IngressObject::Text {
                            text: input_content.to_string(),
                            instructions: instructions.clone(),
                            category: category.clone(),
                            user_id: user_id.into(),
                        });
                    }
                }
            }
        }
        for file in files {
            object_list.push(IngressObject::File {
                file_info: file,
                instructions: instructions.clone(),
                category: category.clone(),
                user_id: user_id.into(),
            })
        }
        // If no objects are constructed, we return Err
        if object_list.is_empty() {
            return Err(AppError::NotFound(
                "No valid content or files provided".into(),
            ));
        }
        Ok(object_list)
    }
    /// Creates a new `TextContent` instance from a `IngressObject`.
    ///
    /// # Arguments
    /// `&self` - A reference to the `IngressObject`.
    ///
    /// # Returns
    /// `TextContent` - An object containing a text representation of the object, could be a scraped URL, parsed PDF, etc.
    pub async fn to_text_content(
        &self,
        openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
    ) -> Result<TextContent, AppError> {
        match self {
            IngressObject::Url {
                url,
                instructions,
                category,
                user_id,
            } => {
                let text = Self::fetch_text_from_url(url, openai_client).await?;
                Ok(TextContent::new(
                    text,
                    instructions.into(),
                    category.into(),
                    None,
                    Some(url.into()),
                    user_id.into(),
                ))
            }
            IngressObject::Text {
                text,
                instructions,
                category,
                user_id,
            } => Ok(TextContent::new(
                text.into(),
                instructions.into(),
                category.into(),
                None,
                None,
                user_id.into(),
            )),
            IngressObject::File {
                file_info,
                instructions,
                category,
                user_id,
            } => {
                let text = Self::extract_text_from_file(file_info).await?;
                Ok(TextContent::new(
                    text,
                    instructions.into(),
                    category.into(),
                    Some(file_info.to_owned()),
                    None,
                    user_id.into(),
                ))
            }
        }
    }
    /// Get text from url, will return it as a markdown formatted string
    async fn fetch_text_from_url(
        url: &str,
        openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
    ) -> Result<String, AppError> {
        // Use a client with timeouts and reuse
        let client = reqwest::ClientBuilder::new()
            .timeout(Duration::from_secs(30))
            .build()?;
        let response = client.get(url).send().await?.text().await?;
        // Preallocate string with capacity
        let mut structured_content = String::with_capacity(response.len() / 2);
        let document = Html::parse_document(&response);
        let main_selectors = Selector::parse(
            "article, main, .article-content, .post-content, .entry-content, [role='main']",
        )
        .unwrap();
        let content_element = document
            .select(&main_selectors)
            .next()
            .or_else(|| document.select(&Selector::parse("body").unwrap()).next())
            .ok_or(AppError::NotFound("No content found".into()))?;
        // Compile selectors once
        let heading_selector = Selector::parse("h1, h2, h3").unwrap();
        let paragraph_selector = Selector::parse("p").unwrap();
        // Process content in one pass
        for element in content_element.select(&heading_selector) {
            let _ = writeln!(
                structured_content,
                "<heading>{}</heading>",
                element.text().collect::<String>().trim()
            );
        }
        for element in content_element.select(&paragraph_selector) {
            let _ = writeln!(
                structured_content,
                "<paragraph>{}</paragraph>",
                element.text().collect::<String>().trim()
            );
        }
        let content = structured_content
            .replace(|c: char| c.is_control(), " ")
            .replace("  ", " ");
        Self::process_web_content(content, openai_client).await
    }
    pub async fn process_web_content(
        content: String,
        openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
    ) -> Result<String, AppError> {
        const MAX_TOKENS: usize = 122000;
        const SYSTEM_PROMPT: &str = r#"
        You are a precise content extractor for web pages. Your task:
        1. Extract ONLY the main article/content from the provided text
        2. Maintain the original content - do not summarize or modify the core information
        3. Ignore peripheral content such as:
            - Navigation elements
            - Error messages (e.g., "JavaScript required")
            - Related articles sections
            - Comments
            - Social media links
            - Advertisement text
        FORMAT:
        - Convert <heading> tags to markdown headings (#, ##, ###)
        - Convert <paragraph> tags to markdown paragraphs
        - Preserve quotes and important formatting
        - Remove duplicate content
        - Remove any metadata or technical artifacts
        OUTPUT RULES:
        - Output ONLY the cleaned content in markdown
        - Do not add any explanations or meta-commentary
        - Do not add summaries or conclusions
        - Do not use any XML/HTML tags in the output
    "#;
        let bpe = o200k_base()?;
        // Process content in chunks if needed
        let truncated_content = if bpe.encode_with_special_tokens(&content).len() > MAX_TOKENS {
            Self::truncate_content(&content, MAX_TOKENS, &bpe)?
        } else {
            content
        };
        let request = CreateChatCompletionRequestArgs::default()
            .model("gpt-4o-mini")
            .temperature(0.0)
            .max_tokens(16200u32)
            .messages([
                ChatCompletionRequestSystemMessage::from(SYSTEM_PROMPT).into(),
                ChatCompletionRequestUserMessage::from(truncated_content).into(),
            ])
            .build()?;
        let response = openai_client.chat().create(request).await?;
        response
            .choices
            .first()
            .and_then(|choice| choice.message.content.as_ref())
            .map(|content| content.to_owned())
            .ok_or(AppError::LLMParsing("No content in response".into()))
    }
    fn truncate_content(
        content: &str,
        max_tokens: usize,
        tokenizer: &CoreBPE,
    ) -> Result<String, AppError> {
        // Pre-allocate with estimated size
        let mut result = String::with_capacity(content.len() / 2);
        let mut current_tokens = 0;
        // Process content by paragraph to maintain context
        for paragraph in content.split("\n\n") {
            let tokens = tokenizer.encode_with_special_tokens(paragraph).len();
            // Check if adding paragraph exceeds limit
            if current_tokens + tokens > max_tokens {
                break;
            }
            result.push_str(paragraph);
            result.push_str("\n\n");
            current_tokens += tokens;
        }
        // Ensure we return valid content
        if result.is_empty() {
            return Err(AppError::Processing("Content exceeds token limit".into()));
        }
        Ok(result.trim_end().to_string())
    }
    /// Extracts text from a file based on its MIME type.
    async fn extract_text_from_file(file_info: &FileInfo) -> Result<String, AppError> {
        match file_info.mime_type.as_str() {
            "text/plain" => {
                // Read the file and return its content
                let content = tokio::fs::read_to_string(&file_info.path).await?;
                Ok(content)
            }
            "text/markdown" => {
                // Read the file and return its content
                let content = tokio::fs::read_to_string(&file_info.path).await?;
                Ok(content)
            }
            "application/pdf" => {
                // TODO: Implement PDF text extraction using a crate like `pdf-extract` or `lopdf`
                Err(AppError::NotFound(file_info.mime_type.clone()))
            }
            "image/png" | "image/jpeg" => {
                // TODO: Implement OCR on image using a crate like `tesseract`
                Err(AppError::NotFound(file_info.mime_type.clone()))
            }
            "application/octet-stream" => {
                let content = tokio::fs::read_to_string(&file_info.path).await?;
                Ok(content)
            }
            "text/x-rust" => {
                let content = tokio::fs::read_to_string(&file_info.path).await?;
                Ok(content)
            }
            // Handle other MIME types as needed
            _ => Err(AppError::NotFound(file_info.mime_type.clone())),
        }
    }
 }
--- a/crates/common/src/ingress/mod.rs
+++ b/crates/common/src/ingress/mod.rs
@@ -1,3 +1,2 @@
 pub mod analysis;
 pub mod content_processor;
 pub mod ingress_object;
--- a/crates/common/src/storage/db.rs
+++ b/crates/common/src/storage/db.rs
@@ -1,6 +1,6 @@
 use crate::error::AppError;
-use super::types::{analytics::Analytics, job::Job, system_settings::SystemSettings, StoredObject};
+use super::types::{analytics::Analytics, system_settings::SystemSettings, StoredObject};
 use axum_session::{SessionConfig, SessionError, SessionStore};
 use axum_session_surreal::SessionSurrealPool;
 use futures::Stream;
@@ -171,9 +171,9 @@ impl SurrealDbClient {
    /// * `Result<Option<T>, Error>` - The deleted item or Error
    pub async fn listen<T>(
        &self,
-    ) -> Result<impl Stream<Item = Result<Notification<Job>, Error>>, Error>
+    ) -> Result<impl Stream<Item = Result<Notification<T>, Error>>, Error>
    where
-        T: for<'de> StoredObject,
+        T: for<'de> StoredObject + std::marker::Unpin,
    {
        self.client.select(T::table_name()).live().await
    }
--- a/crates/common/src/storage/types/ingestion_payload.rs
+++ b/crates/common/src/storage/types/ingestion_payload.rs
@@ -0,0 +1,95 @@
 use crate::{error::AppError, storage::types::file_info::FileInfo};
 use serde::{Deserialize, Serialize};
 use tracing::info;
 use url::Url;
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub enum IngestionPayload {
    Url {
        url: String,
        instructions: String,
        category: String,
        user_id: String,
    },
    Text {
        text: String,
        instructions: String,
        category: String,
        user_id: String,
    },
    File {
        file_info: FileInfo,
        instructions: String,
        category: String,
        user_id: String,
    },
 }
 impl IngestionPayload {
    /// Creates ingestion payloads from the provided content, instructions, and files.
    ///
    /// # Arguments
    /// * `content` - Optional textual content to be ingressed
    /// * `instructions` - Instructions for processing the ingress content
    /// * `category` - Category to classify the ingressed content
    /// * `files` - Vector of `FileInfo` objects containing information about uploaded files
    /// * `user_id` - Identifier of the user performing the ingress operation
    ///
    /// # Returns
    /// * `Result<Vec<IngestionPayload>, AppError>` - On success, returns a vector of ingress objects
    ///   (one per file/content type). On failure, returns an `AppError`.
    pub fn create_ingestion_payload(
        content: Option<String>,
        instructions: String,
        category: String,
        files: Vec<FileInfo>,
        user_id: &str,
    ) -> Result<Vec<IngestionPayload>, AppError> {
        // Initialize list
        let mut object_list = Vec::new();
        // Create a IngestionPayload from content if it exists, checking for URL or text
        if let Some(input_content) = content {
            match Url::parse(&input_content) {
                Ok(url) => {
                    info!("Detected URL: {}", url);
                    object_list.push(IngestionPayload::Url {
                        url: url.to_string(),
                        instructions: instructions.clone(),
                        category: category.clone(),
                        user_id: user_id.into(),
                    });
                }
                Err(_) => {
                    if input_content.len() > 2 {
                        info!("Treating input as plain text");
                        object_list.push(IngestionPayload::Text {
                            text: input_content.to_string(),
                            instructions: instructions.clone(),
                            category: category.clone(),
                            user_id: user_id.into(),
                        });
                    }
                }
            }
        }
        for file in files {
            object_list.push(IngestionPayload::File {
                file_info: file,
                instructions: instructions.clone(),
                category: category.clone(),
                user_id: user_id.into(),
            })
        }
        // If no objects are constructed, we return Err
        if object_list.is_empty() {
            return Err(AppError::NotFound(
                "No valid content or files provided".into(),
            ));
        }
        Ok(object_list)
    }
 }
--- a/crates/common/src/storage/types/ingestion_task.rs
+++ b/crates/common/src/storage/types/ingestion_task.rs
@@ -2,13 +2,12 @@ use futures::Stream;
 use surrealdb::{opt::PatchOp, Notification};
 use uuid::Uuid;
-use crate::{
+use crate::{error::AppError, storage::db::SurrealDbClient, stored_object};
-    error::AppError, ingress::ingress_object::IngressObject, storage::db::SurrealDbClient,
+
-    stored_object,
+use super::ingestion_payload::IngestionPayload;
 };
 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub enum JobStatus {
+pub enum IngestionTaskStatus {
    Created,
    InProgress {
        attempts: u32,
@@ -19,22 +18,22 @@ pub enum JobStatus {
    Cancelled,
 }
-stored_object!(Job, "job", {
+stored_object!(IngestionTask, "job", {
-    content: IngressObject,
+    content: IngestionPayload,
-    status: JobStatus,
+    status: IngestionTaskStatus,
    user_id: String
 });
 pub const MAX_ATTEMPTS: u32 = 3;
-impl Job {
+impl IngestionTask {
-    pub async fn new(content: IngressObject, user_id: String) -> Self {
+    pub async fn new(content: IngestionPayload, user_id: String) -> Self {
        let now = Utc::now();
        Self {
            id: Uuid::new_v4().to_string(),
            content,
-            status: JobStatus::Created,
+            status: IngestionTaskStatus::Created,
            created_at: now,
            updated_at: now,
            user_id,
@@ -43,7 +42,7 @@ impl Job {
    /// Creates a new job and stores it in the database
    pub async fn create_and_add_to_db(
-        content: IngressObject,
+        content: IngestionPayload,
        user_id: String,
        db: &SurrealDbClient,
    ) -> Result<(), AppError> {
@@ -57,10 +56,10 @@ impl Job {
    // Update job status
    pub async fn update_status(
        id: &str,
-        status: JobStatus,
+        status: IngestionTaskStatus,
        db: &SurrealDbClient,
    ) -> Result<(), AppError> {
-        let _job: Option<Job> = db
+        let _job: Option<Self> = db
            .update((Self::table_name(), id))
            .patch(PatchOp::replace("/status", status))
            .patch(PatchOp::replace(
@@ -73,16 +72,16 @@ impl Job {
    }
    /// Listen for new jobs
-    pub async fn listen_for_jobs(
+    pub async fn listen_for_tasks(
        db: &SurrealDbClient,
-    ) -> Result<impl Stream<Item = Result<Notification<Job>, surrealdb::Error>>, surrealdb::Error>
+    ) -> Result<impl Stream<Item = Result<Notification<Self>, surrealdb::Error>>, surrealdb::Error>
    {
-        db.listen::<Job>().await
+        db.listen::<Self>().await
    }
-    /// Get all unfinished jobs, ie newly created and in progress up two times
+    /// Get all unfinished tasks, ie newly created and in progress up two times
-    pub async fn get_unfinished_jobs(db: &SurrealDbClient) -> Result<Vec<Job>, AppError> {
+    pub async fn get_unfinished_tasks(db: &SurrealDbClient) -> Result<Vec<Self>, AppError> {
-        let jobs: Vec<Job> = db
+        let jobs: Vec<Self> = db
            .query(
                "SELECT * FROM type::table($table) 
             WHERE 
--- a/crates/common/src/storage/types/mod.rs
+++ b/crates/common/src/storage/types/mod.rs
@@ -3,7 +3,8 @@ use serde::{Deserialize, Serialize};
 pub mod analytics;
 pub mod conversation;
 pub mod file_info;
-pub mod job;
+pub mod ingestion_payload;
 pub mod ingestion_task;
 pub mod knowledge_entity;
 pub mod knowledge_relationship;
 pub mod message;
--- a/crates/common/src/storage/types/user.rs
+++ b/crates/common/src/storage/types/user.rs
@@ -4,7 +4,7 @@ use surrealdb::{engine::any::Any, Surreal};
 use uuid::Uuid;
 use super::{
-    conversation::Conversation, job::Job, knowledge_entity::KnowledgeEntity,
+    conversation::Conversation, ingestion_task::IngestionTask, knowledge_entity::KnowledgeEntity,
    knowledge_relationship::KnowledgeRelationship, system_settings::SystemSettings,
    text_content::TextContent,
 };
@@ -351,12 +351,12 @@ impl User {
        Ok(conversations)
    }
-    /// Gets all active jobs for the specified user
+    /// Gets all active ingestion tasks for the specified user
-    pub async fn get_unfinished_jobs(
+    pub async fn get_unfinished_ingestion_tasks(
        user_id: &str,
        db: &SurrealDbClient,
-    ) -> Result<Vec<Job>, AppError> {
+    ) -> Result<Vec<IngestionTask>, AppError> {
-        let jobs: Vec<Job> = db
+        let jobs: Vec<IngestionTask> = db
            .query(
                "SELECT * FROM type::table($table) 
             WHERE user_id = $user_id 
@@ -369,7 +369,7 @@ impl User {
             )
             ORDER BY created_at DESC",
            )
-            .bind(("table", Job::table_name()))
+            .bind(("table", IngestionTask::table_name()))
            .bind(("user_id", user_id.to_owned()))
            .bind(("max_attempts", 3))
            .await?
@@ -384,12 +384,12 @@ impl User {
        user_id: &str,
        db: &SurrealDbClient,
    ) -> Result<(), AppError> {
-        db.get_item::<Job>(id)
+        db.get_item::<IngestionTask>(id)
            .await?
            .filter(|job| job.user_id == user_id)
            .ok_or_else(|| AppError::Auth("Not authorized to delete this job".into()))?;
-        db.delete_item::<Job>(id)
+        db.delete_item::<IngestionTask>(id)
            .await
            .map_err(AppError::Database)?;
--- a/crates/html-router/src/routes/index.rs
+++ b/crates/html-router/src/routes/index.rs
@@ -12,7 +12,7 @@ use tracing::info;
 use common::{
    error::{AppError, HtmlError},
    storage::types::{
-        file_info::FileInfo, job::Job, knowledge_entity::KnowledgeEntity,
+        file_info::FileInfo, ingestion_task::IngestionTask, knowledge_entity::KnowledgeEntity,
        knowledge_relationship::KnowledgeRelationship, text_chunk::TextChunk,
        text_content::TextContent, user::User,
    },
@@ -26,7 +26,7 @@ page_data!(IndexData, "index/index.html", {
    gdpr_accepted: bool,
    user: Option<User>,
    latest_text_contents: Vec<TextContent>,
-    active_jobs: Vec<Job>
+    active_jobs: Vec<IngestionTask>
 });
 pub async fn index_handler(
@@ -39,9 +39,11 @@ pub async fn index_handler(
    let gdpr_accepted = auth.current_user.is_some() | session.get("gdpr_accepted").unwrap_or(false);
    let active_jobs = match auth.current_user.is_some() {
-        true => User::get_unfinished_jobs(&auth.current_user.clone().unwrap().id, &state.db)
+        true => {
-            .await
+            User::get_unfinished_ingestion_tasks(&auth.current_user.clone().unwrap().id, &state.db)
-            .map_err(|e| HtmlError::new(e, state.templates.clone()))?,
+                .await
                .map_err(|e| HtmlError::new(e, state.templates.clone()))?
        }
        false => vec![],
    };
@@ -172,7 +174,7 @@ async fn get_and_validate_text_content(
 #[derive(Serialize)]
 pub struct ActiveJobsData {
-    pub active_jobs: Vec<Job>,
+    pub active_jobs: Vec<IngestionTask>,
    pub user: User,
 }
@@ -190,7 +192,7 @@ pub async fn delete_job(
        .await
        .map_err(|e| HtmlError::new(e, state.templates.clone()))?;
-    let active_jobs = User::get_unfinished_jobs(&user.id, &state.db)
+    let active_jobs = User::get_unfinished_ingestion_tasks(&user.id, &state.db)
        .await
        .map_err(|e| HtmlError::new(e, state.templates.clone()))?;
@@ -216,7 +218,7 @@ pub async fn show_active_jobs(
        None => return Ok(Redirect::to("/signin").into_response()),
    };
-    let active_jobs = User::get_unfinished_jobs(&user.id, &state.db)
+    let active_jobs = User::get_unfinished_ingestion_tasks(&user.id, &state.db)
        .await
        .map_err(|e| HtmlError::new(e, state.templates.clone()))?;
--- a/crates/html-router/src/routes/ingress_form.rs
+++ b/crates/html-router/src/routes/ingress_form.rs
@@ -12,8 +12,10 @@ use tracing::info;
 use common::{
    error::{AppError, HtmlError, IntoHtmlError},
-    ingress::ingress_object::IngressObject,
+    storage::types::{
-    storage::types::{file_info::FileInfo, job::Job, user::User},
+        file_info::FileInfo, ingestion_payload::IngestionPayload, ingestion_task::IngestionTask,
        user::User,
    },
 };
 use crate::{
@@ -112,7 +114,7 @@ pub async fn process_ingress_form(
    }))
    .await?;
-    let ingress_objects = IngressObject::create_ingress_objects(
+    let payloads = IngestionPayload::create_ingestion_payload(
        input.content,
        input.instructions,
        input.category,
@@ -121,9 +123,11 @@ pub async fn process_ingress_form(
    )
    .map_err(|e| HtmlError::new(e, state.templates.clone()))?;
-    let futures: Vec<_> = ingress_objects
+    let futures: Vec<_> = payloads
        .into_iter()
-        .map(|object| Job::create_and_add_to_db(object.clone(), user.id.clone(), &state.db))
+        .map(|object| {
            IngestionTask::create_and_add_to_db(object.clone(), user.id.clone(), &state.db)
        })
        .collect();
    try_join_all(futures)
@@ -132,7 +136,7 @@ pub async fn process_ingress_form(
        .map_err(|e| HtmlError::new(e, state.templates.clone()))?;
    // Update the active jobs page with the newly created job
-    let active_jobs = User::get_unfinished_jobs(&user.id, &state.db)
+    let active_jobs = User::get_unfinished_ingestion_tasks(&user.id, &state.db)
        .await
        .map_err(|e| HtmlError::new(e, state.templates.clone()))?;
--- a/crates/ingestion-pipeline/Cargo.toml
+++ b/crates/ingestion-pipeline/Cargo.toml
@@ -0,0 +1,21 @@
 [package]
 name = "ingestion-pipeline"
 version = "0.1.0"
 edition = "2021"
 [dependencies]
 # Workspace dependencies
 tokio = { workspace = true }
 serde = { workspace = true }
 axum = { workspace = true }
 tracing = { workspace = true }
 serde_json = { workspace = true }
 async-openai = "0.24.1"
 tiktoken-rs = "0.6.0"
 reqwest = {version = "0.12.12", features = ["charset", "json"]}
 scraper = "0.22.0"
 chrono = { version = "0.4.39", features = ["serde"] }
 text-splitter = "0.18.1"
 common = { path = "../common" }
--- a/crates/ingestion-pipeline/src/lib.rs
+++ b/crates/ingestion-pipeline/src/lib.rs
@@ -0,0 +1,2 @@
 pub mod pipeline;
 pub mod types;
--- a/crates/ingestion-pipeline/src/pipeline.rs
+++ b/crates/ingestion-pipeline/src/pipeline.rs
@@ -0,0 +1,165 @@
 use std::{sync::Arc, time::Instant};
 use chrono::Utc;
 use text_splitter::TextSplitter;
 use tracing::{debug, info};
 use common::{
    error::AppError,
    storage::{
        db::SurrealDbClient,
        types::{
            ingestion_task::{IngestionTask, IngestionTaskStatus, MAX_ATTEMPTS},
            knowledge_entity::KnowledgeEntity,
            knowledge_relationship::KnowledgeRelationship,
            text_chunk::TextChunk,
            text_content::TextContent,
        },
    },
    utils::embedding::generate_embedding,
 };
 use common::ingress::analysis::{
    ingress_analyser::IngressAnalyzer, types::llm_analysis_result::LLMGraphAnalysisResult,
 };
 use crate::types::to_text_content;
 pub struct IngestionPipeline {
    db: Arc<SurrealDbClient>,
    openai_client: Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
 }
 impl IngestionPipeline {
    pub async fn new(
        db: Arc<SurrealDbClient>,
        openai_client: Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
    ) -> Result<Self, AppError> {
        Ok(Self { db, openai_client })
    }
    pub async fn process_task(&self, task: IngestionTask) -> Result<(), AppError> {
        let current_attempts = match task.status {
            IngestionTaskStatus::InProgress { attempts, .. } => attempts + 1,
            _ => 1,
        };
        // Update status to InProgress with attempt count
        IngestionTask::update_status(
            &task.id,
            IngestionTaskStatus::InProgress {
                attempts: current_attempts,
                last_attempt: Utc::now(),
            },
            &self.db,
        )
        .await?;
        let text_content = to_text_content(task.content, &self.openai_client).await?;
        match self.process(&text_content).await {
            Ok(_) => {
                IngestionTask::update_status(&task.id, IngestionTaskStatus::Completed, &self.db)
                    .await?;
                Ok(())
            }
            Err(e) => {
                if current_attempts >= MAX_ATTEMPTS {
                    IngestionTask::update_status(
                        &task.id,
                        IngestionTaskStatus::Error(format!("Max attempts reached: {}", e)),
                        &self.db,
                    )
                    .await?;
                }
                Err(AppError::Processing(e.to_string()))
            }
        }
    }
    pub async fn process(&self, content: &TextContent) -> Result<(), AppError> {
        let now = Instant::now();
        // Perform analyis, this step also includes retrieval
        let analysis = self.perform_semantic_analysis(content).await?;
        let end = now.elapsed();
        info!(
            "{:?} time elapsed during creation of entities and relationships",
            end
        );
        // Convert analysis to application objects
        let (entities, relationships) = analysis
            .to_database_entities(&content.id, &content.user_id, &self.openai_client)
            .await?;
        // Store everything
        tokio::try_join!(
            self.store_graph_entities(entities, relationships),
            self.store_vector_chunks(content),
        )?;
        // Store original content
        self.db.store_item(content.to_owned()).await?;
        self.db.rebuild_indexes().await?;
        Ok(())
    }
    async fn perform_semantic_analysis(
        &self,
        content: &TextContent,
    ) -> Result<LLMGraphAnalysisResult, AppError> {
        let analyser = IngressAnalyzer::new(&self.db, &self.openai_client);
        analyser
            .analyze_content(
                &content.category,
                &content.instructions,
                &content.text,
                &content.user_id,
            )
            .await
    }
    async fn store_graph_entities(
        &self,
        entities: Vec<KnowledgeEntity>,
        relationships: Vec<KnowledgeRelationship>,
    ) -> Result<(), AppError> {
        for entity in &entities {
            debug!("Storing entity: {:?}", entity);
            self.db.store_item(entity.clone()).await?;
        }
        for relationship in &relationships {
            debug!("Storing relationship: {:?}", relationship);
            relationship.store_relationship(&self.db).await?;
        }
        info!(
            "Stored {} entities and {} relationships",
            entities.len(),
            relationships.len()
        );
        Ok(())
    }
    async fn store_vector_chunks(&self, content: &TextContent) -> Result<(), AppError> {
        let splitter = TextSplitter::new(500..2000);
        let chunks = splitter.chunks(&content.text);
        // Could potentially process chunks in parallel with a bounded concurrent limit
        for chunk in chunks {
            let embedding = generate_embedding(&self.openai_client, chunk).await?;
            let text_chunk = TextChunk::new(
                content.id.to_string(),
                chunk.to_string(),
                embedding,
                content.user_id.to_string(),
            );
            self.db.store_item(text_chunk).await?;
        }
        Ok(())
    }
 }
--- a/crates/ingestion-pipeline/src/types/mod.rs
+++ b/crates/ingestion-pipeline/src/types/mod.rs
@@ -0,0 +1,247 @@
 use std::{sync::Arc, time::Duration};
 use async_openai::types::{
    ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage,
    CreateChatCompletionRequestArgs,
 };
 use common::{
    error::AppError,
    storage::types::{
        file_info::FileInfo, ingestion_payload::IngestionPayload, text_content::TextContent,
    },
 };
 use reqwest;
 use scraper::{Html, Selector};
 use std::fmt::Write;
 use tiktoken_rs::{o200k_base, CoreBPE};
 pub async fn to_text_content(
    ingestion_payload: IngestionPayload,
    openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
 ) -> Result<TextContent, AppError> {
    match ingestion_payload {
        IngestionPayload::Url {
            url,
            instructions,
            category,
            user_id,
        } => {
            let text = fetch_text_from_url(&url, openai_client).await?;
            Ok(TextContent::new(
                text,
                instructions.into(),
                category.into(),
                None,
                Some(url.into()),
                user_id.into(),
            ))
        }
        IngestionPayload::Text {
            text,
            instructions,
            category,
            user_id,
        } => Ok(TextContent::new(
            text.into(),
            instructions.into(),
            category.into(),
            None,
            None,
            user_id.into(),
        )),
        IngestionPayload::File {
            file_info,
            instructions,
            category,
            user_id,
        } => {
            let text = extract_text_from_file(&file_info).await?;
            Ok(TextContent::new(
                text,
                instructions.into(),
                category.into(),
                Some(file_info.to_owned()),
                None,
                user_id.into(),
            ))
        }
    }
 }
 /// Get text from url, will return it as a markdown formatted string
 async fn fetch_text_from_url(
    url: &str,
    openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
 ) -> Result<String, AppError> {
    // Use a client with timeouts and reuse
    let client = reqwest::ClientBuilder::new()
        .timeout(Duration::from_secs(30))
        .build()?;
    let response = client.get(url).send().await?.text().await?;
    // Preallocate string with capacity
    let mut structured_content = String::with_capacity(response.len() / 2);
    let document = Html::parse_document(&response);
    let main_selectors = Selector::parse(
        "article, main, .article-content, .post-content, .entry-content, [role='main']",
    )
    .unwrap();
    let content_element = document
        .select(&main_selectors)
        .next()
        .or_else(|| document.select(&Selector::parse("body").unwrap()).next())
        .ok_or(AppError::NotFound("No content found".into()))?;
    // Compile selectors once
    let heading_selector = Selector::parse("h1, h2, h3").unwrap();
    let paragraph_selector = Selector::parse("p").unwrap();
    // Process content in one pass
    for element in content_element.select(&heading_selector) {
        let _ = writeln!(
            structured_content,
            "<heading>{}</heading>",
            element.text().collect::<String>().trim()
        );
    }
    for element in content_element.select(&paragraph_selector) {
        let _ = writeln!(
            structured_content,
            "<paragraph>{}</paragraph>",
            element.text().collect::<String>().trim()
        );
    }
    let content = structured_content
        .replace(|c: char| c.is_control(), " ")
        .replace("  ", " ");
    process_web_content(content, openai_client).await
 }
 pub async fn process_web_content(
    content: String,
    openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
 ) -> Result<String, AppError> {
    const MAX_TOKENS: usize = 122000;
    const SYSTEM_PROMPT: &str = r#"
        You are a precise content extractor for web pages. Your task:
        1. Extract ONLY the main article/content from the provided text
        2. Maintain the original content - do not summarize or modify the core information
        3. Ignore peripheral content such as:
            - Navigation elements
            - Error messages (e.g., "JavaScript required")
            - Related articles sections
            - Comments
            - Social media links
            - Advertisement text
        FORMAT:
        - Convert <heading> tags to markdown headings (#, ##, ###)
        - Convert <paragraph> tags to markdown paragraphs
        - Preserve quotes and important formatting
        - Remove duplicate content
        - Remove any metadata or technical artifacts
        OUTPUT RULES:
        - Output ONLY the cleaned content in markdown
        - Do not add any explanations or meta-commentary
        - Do not add summaries or conclusions
        - Do not use any XML/HTML tags in the output
    "#;
    let bpe = o200k_base()?;
    // Process content in chunks if needed
    let truncated_content = if bpe.encode_with_special_tokens(&content).len() > MAX_TOKENS {
        truncate_content(&content, MAX_TOKENS, &bpe)?
    } else {
        content
    };
    let request = CreateChatCompletionRequestArgs::default()
        .model("gpt-4o-mini")
        .temperature(0.0)
        .max_tokens(16200u32)
        .messages([
            ChatCompletionRequestSystemMessage::from(SYSTEM_PROMPT).into(),
            ChatCompletionRequestUserMessage::from(truncated_content).into(),
        ])
        .build()?;
    let response = openai_client.chat().create(request).await?;
    response
        .choices
        .first()
        .and_then(|choice| choice.message.content.as_ref())
        .map(|content| content.to_owned())
        .ok_or(AppError::LLMParsing("No content in response".into()))
 }
 fn truncate_content(
    content: &str,
    max_tokens: usize,
    tokenizer: &CoreBPE,
 ) -> Result<String, AppError> {
    // Pre-allocate with estimated size
    let mut result = String::with_capacity(content.len() / 2);
    let mut current_tokens = 0;
    // Process content by paragraph to maintain context
    for paragraph in content.split("\n\n") {
        let tokens = tokenizer.encode_with_special_tokens(paragraph).len();
        // Check if adding paragraph exceeds limit
        if current_tokens + tokens > max_tokens {
            break;
        }
        result.push_str(paragraph);
        result.push_str("\n\n");
        current_tokens += tokens;
    }
    // Ensure we return valid content
    if result.is_empty() {
        return Err(AppError::Processing("Content exceeds token limit".into()));
    }
    Ok(result.trim_end().to_string())
 }
 /// Extracts text from a file based on its MIME type.
 async fn extract_text_from_file(file_info: &FileInfo) -> Result<String, AppError> {
    match file_info.mime_type.as_str() {
        "text/plain" => {
            // Read the file and return its content
            let content = tokio::fs::read_to_string(&file_info.path).await?;
            Ok(content)
        }
        "text/markdown" => {
            // Read the file and return its content
            let content = tokio::fs::read_to_string(&file_info.path).await?;
            Ok(content)
        }
        "application/pdf" => {
            // TODO: Implement PDF text extraction using a crate like `pdf-extract` or `lopdf`
            Err(AppError::NotFound(file_info.mime_type.clone()))
        }
        "image/png" | "image/jpeg" => {
            // TODO: Implement OCR on image using a crate like `tesseract`
            Err(AppError::NotFound(file_info.mime_type.clone()))
        }
        "application/octet-stream" => {
            let content = tokio::fs::read_to_string(&file_info.path).await?;
            Ok(content)
        }
        "text/x-rust" => {
            let content = tokio::fs::read_to_string(&file_info.path).await?;
            Ok(content)
        }
        // Handle other MIME types as needed
        _ => Err(AppError::NotFound(file_info.mime_type.clone())),
    }
 }
--- a/crates/main/Cargo.toml
+++ b/crates/main/Cargo.toml
@@ -45,6 +45,7 @@ url = { version = "2.5.2", features = ["serde"] }
 uuid = { version = "1.10.0", features = ["v4", "serde"] }
 # Reference to api-router
 ingestion-pipeline = { path = "../ingestion-pipeline" }
 api-router = { path = "../api-router" }
 html-router = { path = "../html-router" }
 common = { path = "../common" }
--- a/crates/main/src/worker.rs
+++ b/crates/main/src/worker.rs
@@ -1,14 +1,14 @@
 use std::sync::Arc;
 use common::{
    ingress::content_processor::ContentProcessor,
    storage::{
        db::SurrealDbClient,
-        types::job::{Job, JobStatus},
+        types::ingestion_task::{IngestionTask, IngestionTaskStatus},
    },
    utils::config::get_config,
 };
 use futures::StreamExt;
 use ingestion_pipeline::pipeline::IngestionPipeline;
 use surrealdb::Action;
 use tracing::{error, info};
 use tracing_subscriber::{fmt, prelude::*, EnvFilter};
@@ -37,23 +37,23 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let openai_client = Arc::new(async_openai::Client::new());
-    let content_processor = ContentProcessor::new(db.clone(), openai_client.clone()).await?;
+    let ingestion_pipeline = IngestionPipeline::new(db.clone(), openai_client.clone()).await?;
    loop {
-        // First, check for any unfinished jobs
+        // First, check for any unfinished tasks
-        let unfinished_jobs = Job::get_unfinished_jobs(&db).await?;
+        let unfinished_tasks = IngestionTask::get_unfinished_tasks(&db).await?;
-        if !unfinished_jobs.is_empty() {
+        if !unfinished_tasks.is_empty() {
-            info!("Found {} unfinished jobs", unfinished_jobs.len());
+            info!("Found {} unfinished jobs", unfinished_tasks.len());
-            for job in unfinished_jobs {
+            for task in unfinished_tasks {
-                content_processor.process_job(job).await?;
+                ingestion_pipeline.process_task(task).await?;
            }
        }
        // If no unfinished jobs, start listening for new ones
        info!("Listening for new jobs...");
-        let mut job_stream = Job::listen_for_jobs(&db).await?;
+        let mut job_stream = IngestionTask::listen_for_tasks(&db).await?;
        while let Some(notification) = job_stream.next().await {
            match notification {
@@ -62,41 +62,42 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
                    match notification.action {
                        Action::Create => {
-                            if let Err(e) = content_processor.process_job(notification.data).await {
+                            if let Err(e) = ingestion_pipeline.process_task(notification.data).await
-                                error!("Error processing job: {}", e);
+                            {
                                error!("Error processing task: {}", e);
                            }
                        }
                        Action::Update => {
                            match notification.data.status {
-                                JobStatus::Completed
+                                IngestionTaskStatus::Completed
-                                | JobStatus::Error(_)
+                                | IngestionTaskStatus::Error(_)
-                                | JobStatus::Cancelled => {
+                                | IngestionTaskStatus::Cancelled => {
                                    info!(
-                                        "Skipping already completed/error/cancelled job: {}",
+                                        "Skipping already completed/error/cancelled task: {}",
                                        notification.data.id
                                    );
                                    continue;
                                }
-                                JobStatus::InProgress { attempts, .. } => {
+                                IngestionTaskStatus::InProgress { attempts, .. } => {
                                    // Only process if this is a retry after an error, not our own update
-                                    if let Ok(Some(current_job)) =
+                                    if let Ok(Some(current_task)) =
-                                        db.get_item::<Job>(&notification.data.id).await
+                                        db.get_item::<IngestionTask>(&notification.data.id).await
                                    {
-                                        match current_job.status {
+                                        match current_task.status {
-                                            JobStatus::Error(_)
+                                            IngestionTaskStatus::Error(_)
                                                if attempts
-                                                    < common::storage::types::job::MAX_ATTEMPTS =>
+                                                    < common::storage::types::ingestion_task::MAX_ATTEMPTS =>
                                            {
                                                // This is a retry after an error
                                                if let Err(e) =
-                                                    content_processor.process_job(current_job).await
+                                                    ingestion_pipeline.process_task(current_task).await
                                                {
-                                                    error!("Error processing job retry: {}", e);
+                                                    error!("Error processing task retry: {}", e);
                                                }
                                            }
                                            _ => {
                                                info!(
-                                                    "Skipping in-progress update for job: {}",
+                                                    "Skipping in-progress update for task: {}",
                                                    notification.data.id
                                                );
                                                continue;
@@ -104,12 +105,12 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
                                        }
                                    }
                                }
-                                JobStatus::Created => {
+                                IngestionTaskStatus::Created => {
                                    // Shouldn't happen with Update action, but process if it does
                                    if let Err(e) =
-                                        content_processor.process_job(notification.data).await
+                                        ingestion_pipeline.process_task(notification.data).await
                                    {
-                                        error!("Error processing job: {}", e);
+                                        error!("Error processing task: {}", e);
                                    }
                                }
                            }
@@ -122,7 +123,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
        }
        // If we reach here, the stream has ended (connection lost?)
-        error!("Job stream ended unexpectedly, reconnecting...");
+        error!("Database stream ended unexpectedly, reconnecting...");
        tokio::time::sleep(tokio::time::Duration::from_secs(5)).await;
    }
 }