mirror of
https://github.com/perstarkse/minne.git
synced 2026-04-20 16:01:22 +02:00
ingestion-pipeline crate init, begun moving
This commit is contained in:
@@ -1,162 +1 @@
|
||||
use std::{sync::Arc, time::Instant};
|
||||
|
||||
use chrono::Utc;
|
||||
use text_splitter::TextSplitter;
|
||||
use tracing::{debug, info};
|
||||
|
||||
use crate::{
|
||||
error::AppError,
|
||||
storage::{
|
||||
db::SurrealDbClient,
|
||||
types::{
|
||||
job::{Job, JobStatus, MAX_ATTEMPTS},
|
||||
knowledge_entity::KnowledgeEntity,
|
||||
knowledge_relationship::KnowledgeRelationship,
|
||||
text_chunk::TextChunk,
|
||||
text_content::TextContent,
|
||||
},
|
||||
},
|
||||
utils::embedding::generate_embedding,
|
||||
};
|
||||
|
||||
use super::analysis::{
|
||||
ingress_analyser::IngressAnalyzer, types::llm_analysis_result::LLMGraphAnalysisResult,
|
||||
};
|
||||
|
||||
pub struct ContentProcessor {
|
||||
db: Arc<SurrealDbClient>,
|
||||
openai_client: Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
|
||||
}
|
||||
|
||||
impl ContentProcessor {
|
||||
pub async fn new(
|
||||
db: Arc<SurrealDbClient>,
|
||||
openai_client: Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
|
||||
) -> Result<Self, AppError> {
|
||||
Ok(Self { db, openai_client })
|
||||
}
|
||||
pub async fn process_job(&self, job: Job) -> Result<(), AppError> {
|
||||
let current_attempts = match job.status {
|
||||
JobStatus::InProgress { attempts, .. } => attempts + 1,
|
||||
_ => 1,
|
||||
};
|
||||
|
||||
// Update status to InProgress with attempt count
|
||||
Job::update_status(
|
||||
&job.id,
|
||||
JobStatus::InProgress {
|
||||
attempts: current_attempts,
|
||||
last_attempt: Utc::now(),
|
||||
},
|
||||
&self.db,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let text_content = job.content.to_text_content(&self.openai_client).await?;
|
||||
|
||||
match self.process(&text_content).await {
|
||||
Ok(_) => {
|
||||
Job::update_status(&job.id, JobStatus::Completed, &self.db).await?;
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
if current_attempts >= MAX_ATTEMPTS {
|
||||
Job::update_status(
|
||||
&job.id,
|
||||
JobStatus::Error(format!("Max attempts reached: {}", e)),
|
||||
&self.db,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Err(AppError::Processing(e.to_string()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn process(&self, content: &TextContent) -> Result<(), AppError> {
|
||||
let now = Instant::now();
|
||||
|
||||
// Perform analyis, this step also includes retrieval
|
||||
let analysis = self.perform_semantic_analysis(content).await?;
|
||||
|
||||
let end = now.elapsed();
|
||||
info!(
|
||||
"{:?} time elapsed during creation of entities and relationships",
|
||||
end
|
||||
);
|
||||
|
||||
// Convert analysis to objects
|
||||
let (entities, relationships) = analysis
|
||||
.to_database_entities(&content.id, &content.user_id, &self.openai_client)
|
||||
.await?;
|
||||
|
||||
// Store everything
|
||||
tokio::try_join!(
|
||||
self.store_graph_entities(entities, relationships),
|
||||
self.store_vector_chunks(content),
|
||||
)?;
|
||||
|
||||
// Store original content
|
||||
self.db.store_item(content.to_owned()).await?;
|
||||
|
||||
self.db.rebuild_indexes().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn perform_semantic_analysis(
|
||||
&self,
|
||||
content: &TextContent,
|
||||
) -> Result<LLMGraphAnalysisResult, AppError> {
|
||||
let analyser = IngressAnalyzer::new(&self.db, &self.openai_client);
|
||||
analyser
|
||||
.analyze_content(
|
||||
&content.category,
|
||||
&content.instructions,
|
||||
&content.text,
|
||||
&content.user_id,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn store_graph_entities(
|
||||
&self,
|
||||
entities: Vec<KnowledgeEntity>,
|
||||
relationships: Vec<KnowledgeRelationship>,
|
||||
) -> Result<(), AppError> {
|
||||
for entity in &entities {
|
||||
debug!("Storing entity: {:?}", entity);
|
||||
self.db.store_item(entity.clone()).await?;
|
||||
}
|
||||
|
||||
for relationship in &relationships {
|
||||
debug!("Storing relationship: {:?}", relationship);
|
||||
relationship.store_relationship(&self.db).await?;
|
||||
}
|
||||
|
||||
info!(
|
||||
"Stored {} entities and {} relationships",
|
||||
entities.len(),
|
||||
relationships.len()
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn store_vector_chunks(&self, content: &TextContent) -> Result<(), AppError> {
|
||||
let splitter = TextSplitter::new(500..2000);
|
||||
let chunks = splitter.chunks(&content.text);
|
||||
|
||||
// Could potentially process chunks in parallel with a bounded concurrent limit
|
||||
for chunk in chunks {
|
||||
let embedding = generate_embedding(&self.openai_client, chunk).await?;
|
||||
let text_chunk = TextChunk::new(
|
||||
content.id.to_string(),
|
||||
chunk.to_string(),
|
||||
embedding,
|
||||
content.user_id.to_string(),
|
||||
);
|
||||
self.db.store_item(text_chunk).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,345 +0,0 @@
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
use crate::{
|
||||
error::AppError,
|
||||
storage::types::{file_info::FileInfo, text_content::TextContent},
|
||||
};
|
||||
use async_openai::types::{
|
||||
ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage,
|
||||
CreateChatCompletionRequestArgs,
|
||||
};
|
||||
use reqwest;
|
||||
use scraper::{Html, Selector};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt::Write;
|
||||
use tiktoken_rs::{o200k_base, CoreBPE};
|
||||
use tracing::info;
|
||||
use url::Url;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub enum IngressObject {
|
||||
Url {
|
||||
url: String,
|
||||
instructions: String,
|
||||
category: String,
|
||||
user_id: String,
|
||||
},
|
||||
Text {
|
||||
text: String,
|
||||
instructions: String,
|
||||
category: String,
|
||||
user_id: String,
|
||||
},
|
||||
File {
|
||||
file_info: FileInfo,
|
||||
instructions: String,
|
||||
category: String,
|
||||
user_id: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl IngressObject {
|
||||
/// Creates ingress objects from the provided content, instructions, and files.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `content` - Optional textual content to be ingressed
|
||||
/// * `instructions` - Instructions for processing the ingress content
|
||||
/// * `category` - Category to classify the ingressed content
|
||||
/// * `files` - Vector of `FileInfo` objects containing information about uploaded files
|
||||
/// * `user_id` - Identifier of the user performing the ingress operation
|
||||
///
|
||||
/// # Returns
|
||||
/// * `Result<Vec<IngressObject>, AppError>` - On success, returns a vector of ingress objects
|
||||
/// (one per file/content type). On failure, returns an `AppError`.
|
||||
pub fn create_ingress_objects(
|
||||
content: Option<String>,
|
||||
instructions: String,
|
||||
category: String,
|
||||
files: Vec<FileInfo>,
|
||||
user_id: &str,
|
||||
) -> Result<Vec<IngressObject>, AppError> {
|
||||
// Initialize list
|
||||
let mut object_list = Vec::new();
|
||||
|
||||
// Create a IngressObject from content if it exists, checking for URL or text
|
||||
if let Some(input_content) = content {
|
||||
match Url::parse(&input_content) {
|
||||
Ok(url) => {
|
||||
info!("Detected URL: {}", url);
|
||||
object_list.push(IngressObject::Url {
|
||||
url: url.to_string(),
|
||||
instructions: instructions.clone(),
|
||||
category: category.clone(),
|
||||
user_id: user_id.into(),
|
||||
});
|
||||
}
|
||||
Err(_) => {
|
||||
if input_content.len() > 2 {
|
||||
info!("Treating input as plain text");
|
||||
object_list.push(IngressObject::Text {
|
||||
text: input_content.to_string(),
|
||||
instructions: instructions.clone(),
|
||||
category: category.clone(),
|
||||
user_id: user_id.into(),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for file in files {
|
||||
object_list.push(IngressObject::File {
|
||||
file_info: file,
|
||||
instructions: instructions.clone(),
|
||||
category: category.clone(),
|
||||
user_id: user_id.into(),
|
||||
})
|
||||
}
|
||||
|
||||
// If no objects are constructed, we return Err
|
||||
if object_list.is_empty() {
|
||||
return Err(AppError::NotFound(
|
||||
"No valid content or files provided".into(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(object_list)
|
||||
}
|
||||
/// Creates a new `TextContent` instance from a `IngressObject`.
|
||||
///
|
||||
/// # Arguments
|
||||
/// `&self` - A reference to the `IngressObject`.
|
||||
///
|
||||
/// # Returns
|
||||
/// `TextContent` - An object containing a text representation of the object, could be a scraped URL, parsed PDF, etc.
|
||||
pub async fn to_text_content(
|
||||
&self,
|
||||
openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
|
||||
) -> Result<TextContent, AppError> {
|
||||
match self {
|
||||
IngressObject::Url {
|
||||
url,
|
||||
instructions,
|
||||
category,
|
||||
user_id,
|
||||
} => {
|
||||
let text = Self::fetch_text_from_url(url, openai_client).await?;
|
||||
Ok(TextContent::new(
|
||||
text,
|
||||
instructions.into(),
|
||||
category.into(),
|
||||
None,
|
||||
Some(url.into()),
|
||||
user_id.into(),
|
||||
))
|
||||
}
|
||||
IngressObject::Text {
|
||||
text,
|
||||
instructions,
|
||||
category,
|
||||
user_id,
|
||||
} => Ok(TextContent::new(
|
||||
text.into(),
|
||||
instructions.into(),
|
||||
category.into(),
|
||||
None,
|
||||
None,
|
||||
user_id.into(),
|
||||
)),
|
||||
IngressObject::File {
|
||||
file_info,
|
||||
instructions,
|
||||
category,
|
||||
user_id,
|
||||
} => {
|
||||
let text = Self::extract_text_from_file(file_info).await?;
|
||||
Ok(TextContent::new(
|
||||
text,
|
||||
instructions.into(),
|
||||
category.into(),
|
||||
Some(file_info.to_owned()),
|
||||
None,
|
||||
user_id.into(),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get text from url, will return it as a markdown formatted string
|
||||
async fn fetch_text_from_url(
|
||||
url: &str,
|
||||
openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
|
||||
) -> Result<String, AppError> {
|
||||
// Use a client with timeouts and reuse
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()?;
|
||||
let response = client.get(url).send().await?.text().await?;
|
||||
|
||||
// Preallocate string with capacity
|
||||
let mut structured_content = String::with_capacity(response.len() / 2);
|
||||
|
||||
let document = Html::parse_document(&response);
|
||||
let main_selectors = Selector::parse(
|
||||
"article, main, .article-content, .post-content, .entry-content, [role='main']",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let content_element = document
|
||||
.select(&main_selectors)
|
||||
.next()
|
||||
.or_else(|| document.select(&Selector::parse("body").unwrap()).next())
|
||||
.ok_or(AppError::NotFound("No content found".into()))?;
|
||||
|
||||
// Compile selectors once
|
||||
let heading_selector = Selector::parse("h1, h2, h3").unwrap();
|
||||
let paragraph_selector = Selector::parse("p").unwrap();
|
||||
|
||||
// Process content in one pass
|
||||
for element in content_element.select(&heading_selector) {
|
||||
let _ = writeln!(
|
||||
structured_content,
|
||||
"<heading>{}</heading>",
|
||||
element.text().collect::<String>().trim()
|
||||
);
|
||||
}
|
||||
for element in content_element.select(¶graph_selector) {
|
||||
let _ = writeln!(
|
||||
structured_content,
|
||||
"<paragraph>{}</paragraph>",
|
||||
element.text().collect::<String>().trim()
|
||||
);
|
||||
}
|
||||
|
||||
let content = structured_content
|
||||
.replace(|c: char| c.is_control(), " ")
|
||||
.replace(" ", " ");
|
||||
Self::process_web_content(content, openai_client).await
|
||||
}
|
||||
|
||||
pub async fn process_web_content(
|
||||
content: String,
|
||||
openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
|
||||
) -> Result<String, AppError> {
|
||||
const MAX_TOKENS: usize = 122000;
|
||||
const SYSTEM_PROMPT: &str = r#"
|
||||
You are a precise content extractor for web pages. Your task:
|
||||
|
||||
1. Extract ONLY the main article/content from the provided text
|
||||
2. Maintain the original content - do not summarize or modify the core information
|
||||
3. Ignore peripheral content such as:
|
||||
- Navigation elements
|
||||
- Error messages (e.g., "JavaScript required")
|
||||
- Related articles sections
|
||||
- Comments
|
||||
- Social media links
|
||||
- Advertisement text
|
||||
|
||||
FORMAT:
|
||||
- Convert <heading> tags to markdown headings (#, ##, ###)
|
||||
- Convert <paragraph> tags to markdown paragraphs
|
||||
- Preserve quotes and important formatting
|
||||
- Remove duplicate content
|
||||
- Remove any metadata or technical artifacts
|
||||
|
||||
OUTPUT RULES:
|
||||
- Output ONLY the cleaned content in markdown
|
||||
- Do not add any explanations or meta-commentary
|
||||
- Do not add summaries or conclusions
|
||||
- Do not use any XML/HTML tags in the output
|
||||
"#;
|
||||
|
||||
let bpe = o200k_base()?;
|
||||
|
||||
// Process content in chunks if needed
|
||||
let truncated_content = if bpe.encode_with_special_tokens(&content).len() > MAX_TOKENS {
|
||||
Self::truncate_content(&content, MAX_TOKENS, &bpe)?
|
||||
} else {
|
||||
content
|
||||
};
|
||||
|
||||
let request = CreateChatCompletionRequestArgs::default()
|
||||
.model("gpt-4o-mini")
|
||||
.temperature(0.0)
|
||||
.max_tokens(16200u32)
|
||||
.messages([
|
||||
ChatCompletionRequestSystemMessage::from(SYSTEM_PROMPT).into(),
|
||||
ChatCompletionRequestUserMessage::from(truncated_content).into(),
|
||||
])
|
||||
.build()?;
|
||||
|
||||
let response = openai_client.chat().create(request).await?;
|
||||
|
||||
response
|
||||
.choices
|
||||
.first()
|
||||
.and_then(|choice| choice.message.content.as_ref())
|
||||
.map(|content| content.to_owned())
|
||||
.ok_or(AppError::LLMParsing("No content in response".into()))
|
||||
}
|
||||
|
||||
fn truncate_content(
|
||||
content: &str,
|
||||
max_tokens: usize,
|
||||
tokenizer: &CoreBPE,
|
||||
) -> Result<String, AppError> {
|
||||
// Pre-allocate with estimated size
|
||||
let mut result = String::with_capacity(content.len() / 2);
|
||||
let mut current_tokens = 0;
|
||||
|
||||
// Process content by paragraph to maintain context
|
||||
for paragraph in content.split("\n\n") {
|
||||
let tokens = tokenizer.encode_with_special_tokens(paragraph).len();
|
||||
|
||||
// Check if adding paragraph exceeds limit
|
||||
if current_tokens + tokens > max_tokens {
|
||||
break;
|
||||
}
|
||||
|
||||
result.push_str(paragraph);
|
||||
result.push_str("\n\n");
|
||||
current_tokens += tokens;
|
||||
}
|
||||
|
||||
// Ensure we return valid content
|
||||
if result.is_empty() {
|
||||
return Err(AppError::Processing("Content exceeds token limit".into()));
|
||||
}
|
||||
|
||||
Ok(result.trim_end().to_string())
|
||||
}
|
||||
|
||||
/// Extracts text from a file based on its MIME type.
|
||||
async fn extract_text_from_file(file_info: &FileInfo) -> Result<String, AppError> {
|
||||
match file_info.mime_type.as_str() {
|
||||
"text/plain" => {
|
||||
// Read the file and return its content
|
||||
let content = tokio::fs::read_to_string(&file_info.path).await?;
|
||||
Ok(content)
|
||||
}
|
||||
"text/markdown" => {
|
||||
// Read the file and return its content
|
||||
let content = tokio::fs::read_to_string(&file_info.path).await?;
|
||||
Ok(content)
|
||||
}
|
||||
"application/pdf" => {
|
||||
// TODO: Implement PDF text extraction using a crate like `pdf-extract` or `lopdf`
|
||||
Err(AppError::NotFound(file_info.mime_type.clone()))
|
||||
}
|
||||
"image/png" | "image/jpeg" => {
|
||||
// TODO: Implement OCR on image using a crate like `tesseract`
|
||||
Err(AppError::NotFound(file_info.mime_type.clone()))
|
||||
}
|
||||
"application/octet-stream" => {
|
||||
let content = tokio::fs::read_to_string(&file_info.path).await?;
|
||||
Ok(content)
|
||||
}
|
||||
"text/x-rust" => {
|
||||
let content = tokio::fs::read_to_string(&file_info.path).await?;
|
||||
Ok(content)
|
||||
}
|
||||
// Handle other MIME types as needed
|
||||
_ => Err(AppError::NotFound(file_info.mime_type.clone())),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,3 +1,2 @@
|
||||
pub mod analysis;
|
||||
pub mod content_processor;
|
||||
pub mod ingress_object;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::error::AppError;
|
||||
|
||||
use super::types::{analytics::Analytics, job::Job, system_settings::SystemSettings, StoredObject};
|
||||
use super::types::{analytics::Analytics, system_settings::SystemSettings, StoredObject};
|
||||
use axum_session::{SessionConfig, SessionError, SessionStore};
|
||||
use axum_session_surreal::SessionSurrealPool;
|
||||
use futures::Stream;
|
||||
@@ -171,9 +171,9 @@ impl SurrealDbClient {
|
||||
/// * `Result<impl Stream<Item = Result<Notification<T>, Error>>, Error>` - A live stream of notifications for `T`'s table, or Error
|
||||
pub async fn listen<T>(
|
||||
&self,
|
||||
) -> Result<impl Stream<Item = Result<Notification<Job>, Error>>, Error>
|
||||
) -> Result<impl Stream<Item = Result<Notification<T>, Error>>, Error>
|
||||
where
|
||||
T: for<'de> StoredObject,
|
||||
T: for<'de> StoredObject + std::marker::Unpin,
|
||||
{
|
||||
self.client.select(T::table_name()).live().await
|
||||
}
|
||||
|
||||
95
crates/common/src/storage/types/ingestion_payload.rs
Normal file
95
crates/common/src/storage/types/ingestion_payload.rs
Normal file
@@ -0,0 +1,95 @@
|
||||
use crate::{error::AppError, storage::types::file_info::FileInfo};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::info;
|
||||
use url::Url;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub enum IngestionPayload {
|
||||
Url {
|
||||
url: String,
|
||||
instructions: String,
|
||||
category: String,
|
||||
user_id: String,
|
||||
},
|
||||
Text {
|
||||
text: String,
|
||||
instructions: String,
|
||||
category: String,
|
||||
user_id: String,
|
||||
},
|
||||
File {
|
||||
file_info: FileInfo,
|
||||
instructions: String,
|
||||
category: String,
|
||||
user_id: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl IngestionPayload {
|
||||
/// Creates ingestion payloads from the provided content, instructions, and files.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `content` - Optional textual content to be ingressed
|
||||
/// * `instructions` - Instructions for processing the ingress content
|
||||
/// * `category` - Category to classify the ingressed content
|
||||
/// * `files` - Vector of `FileInfo` objects containing information about uploaded files
|
||||
/// * `user_id` - Identifier of the user performing the ingress operation
|
||||
///
|
||||
/// # Returns
|
||||
/// * `Result<Vec<IngestionPayload>, AppError>` - On success, returns a vector of ingress objects
|
||||
/// (one per file/content type). On failure, returns an `AppError`.
|
||||
pub fn create_ingestion_payload(
|
||||
content: Option<String>,
|
||||
instructions: String,
|
||||
category: String,
|
||||
files: Vec<FileInfo>,
|
||||
user_id: &str,
|
||||
) -> Result<Vec<IngestionPayload>, AppError> {
|
||||
// Initialize list
|
||||
let mut object_list = Vec::new();
|
||||
|
||||
// Create a IngestionPayload from content if it exists, checking for URL or text
|
||||
if let Some(input_content) = content {
|
||||
match Url::parse(&input_content) {
|
||||
Ok(url) => {
|
||||
info!("Detected URL: {}", url);
|
||||
object_list.push(IngestionPayload::Url {
|
||||
url: url.to_string(),
|
||||
instructions: instructions.clone(),
|
||||
category: category.clone(),
|
||||
user_id: user_id.into(),
|
||||
});
|
||||
}
|
||||
Err(_) => {
|
||||
if input_content.len() > 2 {
|
||||
info!("Treating input as plain text");
|
||||
object_list.push(IngestionPayload::Text {
|
||||
text: input_content.to_string(),
|
||||
instructions: instructions.clone(),
|
||||
category: category.clone(),
|
||||
user_id: user_id.into(),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for file in files {
|
||||
object_list.push(IngestionPayload::File {
|
||||
file_info: file,
|
||||
instructions: instructions.clone(),
|
||||
category: category.clone(),
|
||||
user_id: user_id.into(),
|
||||
})
|
||||
}
|
||||
|
||||
// If no objects are constructed, we return Err
|
||||
if object_list.is_empty() {
|
||||
return Err(AppError::NotFound(
|
||||
"No valid content or files provided".into(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(object_list)
|
||||
}
|
||||
}
|
||||
@@ -2,13 +2,12 @@ use futures::Stream;
|
||||
use surrealdb::{opt::PatchOp, Notification};
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::{
|
||||
error::AppError, ingress::ingress_object::IngressObject, storage::db::SurrealDbClient,
|
||||
stored_object,
|
||||
};
|
||||
use crate::{error::AppError, storage::db::SurrealDbClient, stored_object};
|
||||
|
||||
use super::ingestion_payload::IngestionPayload;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum JobStatus {
|
||||
pub enum IngestionTaskStatus {
|
||||
Created,
|
||||
InProgress {
|
||||
attempts: u32,
|
||||
@@ -19,22 +18,22 @@ pub enum JobStatus {
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
stored_object!(Job, "job", {
|
||||
content: IngressObject,
|
||||
status: JobStatus,
|
||||
stored_object!(IngestionTask, "job", {
|
||||
content: IngestionPayload,
|
||||
status: IngestionTaskStatus,
|
||||
user_id: String
|
||||
});
|
||||
|
||||
pub const MAX_ATTEMPTS: u32 = 3;
|
||||
|
||||
impl Job {
|
||||
pub async fn new(content: IngressObject, user_id: String) -> Self {
|
||||
impl IngestionTask {
|
||||
pub async fn new(content: IngestionPayload, user_id: String) -> Self {
|
||||
let now = Utc::now();
|
||||
|
||||
Self {
|
||||
id: Uuid::new_v4().to_string(),
|
||||
content,
|
||||
status: JobStatus::Created,
|
||||
status: IngestionTaskStatus::Created,
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
user_id,
|
||||
@@ -43,7 +42,7 @@ impl Job {
|
||||
|
||||
/// Creates a new job and stores it in the database
|
||||
pub async fn create_and_add_to_db(
|
||||
content: IngressObject,
|
||||
content: IngestionPayload,
|
||||
user_id: String,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<(), AppError> {
|
||||
@@ -57,10 +56,10 @@ impl Job {
|
||||
// Update job status
|
||||
pub async fn update_status(
|
||||
id: &str,
|
||||
status: JobStatus,
|
||||
status: IngestionTaskStatus,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<(), AppError> {
|
||||
let _job: Option<Job> = db
|
||||
let _job: Option<Self> = db
|
||||
.update((Self::table_name(), id))
|
||||
.patch(PatchOp::replace("/status", status))
|
||||
.patch(PatchOp::replace(
|
||||
@@ -73,16 +72,16 @@ impl Job {
|
||||
}
|
||||
|
||||
/// Listen for new jobs
|
||||
pub async fn listen_for_jobs(
|
||||
pub async fn listen_for_tasks(
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<impl Stream<Item = Result<Notification<Job>, surrealdb::Error>>, surrealdb::Error>
|
||||
) -> Result<impl Stream<Item = Result<Notification<Self>, surrealdb::Error>>, surrealdb::Error>
|
||||
{
|
||||
db.listen::<Job>().await
|
||||
db.listen::<Self>().await
|
||||
}
|
||||
|
||||
/// Get all unfinished jobs, ie newly created and in progress up two times
|
||||
pub async fn get_unfinished_jobs(db: &SurrealDbClient) -> Result<Vec<Job>, AppError> {
|
||||
let jobs: Vec<Job> = db
|
||||
/// Get all unfinished tasks, i.e. newly created and in progress up to two times
|
||||
pub async fn get_unfinished_tasks(db: &SurrealDbClient) -> Result<Vec<Self>, AppError> {
|
||||
let jobs: Vec<Self> = db
|
||||
.query(
|
||||
"SELECT * FROM type::table($table)
|
||||
WHERE
|
||||
@@ -3,7 +3,8 @@ use serde::{Deserialize, Serialize};
|
||||
pub mod analytics;
|
||||
pub mod conversation;
|
||||
pub mod file_info;
|
||||
pub mod job;
|
||||
pub mod ingestion_payload;
|
||||
pub mod ingestion_task;
|
||||
pub mod knowledge_entity;
|
||||
pub mod knowledge_relationship;
|
||||
pub mod message;
|
||||
|
||||
@@ -4,7 +4,7 @@ use surrealdb::{engine::any::Any, Surreal};
|
||||
use uuid::Uuid;
|
||||
|
||||
use super::{
|
||||
conversation::Conversation, job::Job, knowledge_entity::KnowledgeEntity,
|
||||
conversation::Conversation, ingestion_task::IngestionTask, knowledge_entity::KnowledgeEntity,
|
||||
knowledge_relationship::KnowledgeRelationship, system_settings::SystemSettings,
|
||||
text_content::TextContent,
|
||||
};
|
||||
@@ -351,12 +351,12 @@ impl User {
|
||||
Ok(conversations)
|
||||
}
|
||||
|
||||
/// Gets all active jobs for the specified user
|
||||
pub async fn get_unfinished_jobs(
|
||||
/// Gets all active ingestion tasks for the specified user
|
||||
pub async fn get_unfinished_ingestion_tasks(
|
||||
user_id: &str,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<Vec<Job>, AppError> {
|
||||
let jobs: Vec<Job> = db
|
||||
) -> Result<Vec<IngestionTask>, AppError> {
|
||||
let jobs: Vec<IngestionTask> = db
|
||||
.query(
|
||||
"SELECT * FROM type::table($table)
|
||||
WHERE user_id = $user_id
|
||||
@@ -369,7 +369,7 @@ impl User {
|
||||
)
|
||||
ORDER BY created_at DESC",
|
||||
)
|
||||
.bind(("table", Job::table_name()))
|
||||
.bind(("table", IngestionTask::table_name()))
|
||||
.bind(("user_id", user_id.to_owned()))
|
||||
.bind(("max_attempts", 3))
|
||||
.await?
|
||||
@@ -384,12 +384,12 @@ impl User {
|
||||
user_id: &str,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<(), AppError> {
|
||||
db.get_item::<Job>(id)
|
||||
db.get_item::<IngestionTask>(id)
|
||||
.await?
|
||||
.filter(|job| job.user_id == user_id)
|
||||
.ok_or_else(|| AppError::Auth("Not authorized to delete this job".into()))?;
|
||||
|
||||
db.delete_item::<Job>(id)
|
||||
db.delete_item::<IngestionTask>(id)
|
||||
.await
|
||||
.map_err(AppError::Database)?;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user