mirror of
https://github.com/perstarkse/minne.git
synced 2026-01-11 20:50:24 +01:00
chore: clippy ingestion-pipeline
This commit is contained in:
@@ -25,7 +25,7 @@ pub struct IngestionEnricher {
|
||||
}
|
||||
|
||||
impl IngestionEnricher {
|
||||
pub fn new(
|
||||
pub const fn new(
|
||||
db_client: Arc<SurrealDbClient>,
|
||||
openai_client: Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
|
||||
) -> Self {
|
||||
@@ -61,8 +61,7 @@ impl IngestionEnricher {
|
||||
user_id: &str,
|
||||
) -> Result<Vec<RetrievedEntity>, AppError> {
|
||||
let input_text = format!(
|
||||
"content: {}, category: {}, user_context: {:?}",
|
||||
text, category, context
|
||||
"content: {text}, category: {category}, user_context: {context:?}"
|
||||
);
|
||||
|
||||
retrieve_entities(&self.db_client, &self.openai_client, &input_text, user_id).await
|
||||
@@ -80,8 +79,7 @@ impl IngestionEnricher {
|
||||
let entities_json = format_entities_json(similar_entities);
|
||||
|
||||
let user_message = format!(
|
||||
"Category:\n{}\ncontext:\n{:?}\nContent:\n{}\nExisting KnowledgeEntities in database:\n{}",
|
||||
category, context, text, entities_json
|
||||
"Category:\n{category}\ncontext:\n{context:?}\nContent:\n{text}\nExisting KnowledgeEntities in database:\n{entities_json}"
|
||||
);
|
||||
|
||||
debug!("Prepared LLM request message: {}", user_message);
|
||||
@@ -122,7 +120,7 @@ impl IngestionEnricher {
|
||||
))?;
|
||||
|
||||
serde_json::from_str::<LLMEnrichmentResult>(content).map_err(|e| {
|
||||
AppError::LLMParsing(format!("Failed to parse LLM response into analysis: {}", e))
|
||||
AppError::LLMParsing(format!("Failed to parse LLM response into analysis: {e}"))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,7 +68,7 @@ impl IngestionPipeline {
|
||||
.await?;
|
||||
|
||||
match self.process(&text_content).await {
|
||||
Ok(_) => {
|
||||
Ok(()) => {
|
||||
processing_task.mark_succeeded(&self.db).await?;
|
||||
info!(%task_id, attempt, "ingestion task succeeded");
|
||||
Ok(())
|
||||
@@ -173,7 +173,7 @@ impl IngestionPipeline {
|
||||
let entity_count = entities.len();
|
||||
let relationship_count = relationships.len();
|
||||
|
||||
const STORE_GRAPH_MUTATION: &str = r#"
|
||||
const STORE_GRAPH_MUTATION: &str = r"
|
||||
BEGIN TRANSACTION;
|
||||
LET $entities = $entities;
|
||||
LET $relationships = $relationships;
|
||||
@@ -192,7 +192,7 @@ impl IngestionPipeline {
|
||||
};
|
||||
|
||||
COMMIT TRANSACTION;
|
||||
"#;
|
||||
";
|
||||
|
||||
const MAX_ATTEMPTS: usize = 3;
|
||||
const INITIAL_BACKOFF_MS: u64 = 50;
|
||||
|
||||
@@ -49,7 +49,7 @@ impl LLMEnrichmentResult {
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `source_id` - A UUID representing the source identifier.
|
||||
/// * `openai_client` - OpenAI client for LLM calls.
|
||||
/// * `openai_client` - `OpenAI` client for LLM calls.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
|
||||
@@ -4,7 +4,7 @@ use common::{
|
||||
storage::{db::SurrealDbClient, types::system_settings::SystemSettings},
|
||||
};
|
||||
|
||||
/// Transcribes an audio file using the configured OpenAI Whisper model.
|
||||
/// Transcribes an audio file using the configured `OpenAI` Whisper model.
|
||||
pub async fn transcribe_audio_file(
|
||||
file_path: &str,
|
||||
db_client: &SurrealDbClient,
|
||||
@@ -23,6 +23,6 @@ pub async fn transcribe_audio_file(
|
||||
.audio()
|
||||
.transcribe(request)
|
||||
.await
|
||||
.map_err(|e| AppError::Processing(format!("Audio transcription failed: {}", e)))?;
|
||||
.map_err(|e| AppError::Processing(format!("Audio transcription failed: {e}")))?;
|
||||
Ok(response.text)
|
||||
}
|
||||
|
||||
@@ -19,7 +19,7 @@ pub async fn extract_text_from_image(
|
||||
|
||||
let base64_image = STANDARD.encode(&image_bytes);
|
||||
|
||||
let image_url = format!("data:image/png;base64,{}", base64_image);
|
||||
let image_url = format!("data:image/png;base64,{base64_image}");
|
||||
|
||||
let request = CreateChatCompletionRequestArgs::default()
|
||||
.model(system_settings.image_processing_model)
|
||||
|
||||
@@ -15,13 +15,13 @@ pub struct GraphMapper {
|
||||
|
||||
impl Default for GraphMapper {
|
||||
fn default() -> Self {
|
||||
GraphMapper::new()
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl GraphMapper {
|
||||
pub fn new() -> Self {
|
||||
GraphMapper {
|
||||
Self {
|
||||
key_to_id: HashMap::new(),
|
||||
}
|
||||
}
|
||||
@@ -36,8 +36,7 @@ impl GraphMapper {
|
||||
// If parsing fails, look it up in the map.
|
||||
self.key_to_id.get(key).copied().ok_or_else(|| {
|
||||
AppError::GraphMapper(format!(
|
||||
"Key '{}' is not a valid UUID and was not found in the map.",
|
||||
key
|
||||
"Key '{key}' is not a valid UUID and was not found in the map."
|
||||
))
|
||||
})
|
||||
}
|
||||
@@ -54,6 +53,6 @@ impl GraphMapper {
|
||||
self.key_to_id
|
||||
.get(key)
|
||||
.copied()
|
||||
.ok_or_else(|| AppError::GraphMapper(format!("Key '{}' not found in map.", key)))
|
||||
.ok_or_else(|| AppError::GraphMapper(format!("Key '{key}' not found in map.")))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -118,7 +118,7 @@ async fn load_page_numbers(pdf_bytes: Vec<u8>) -> Result<Vec<u32>, AppError> {
|
||||
/// Uses the existing headless Chrome dependency to rasterize the requested PDF pages into PNGs.
|
||||
async fn render_pdf_pages(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>>, AppError> {
|
||||
let file_url = url::Url::from_file_path(file_path)
|
||||
.map_err(|_| AppError::Processing("Unable to construct PDF file URL".into()))?;
|
||||
.map_err(|()| AppError::Processing("Unable to construct PDF file URL".into()))?;
|
||||
|
||||
let browser = create_browser()?;
|
||||
let tab = browser
|
||||
@@ -133,8 +133,7 @@ async fn render_pdf_pages(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>
|
||||
|
||||
for (idx, page) in pages.iter().enumerate() {
|
||||
let target = format!(
|
||||
"{}#page={}&toolbar=0&statusbar=0&zoom=page-fit",
|
||||
file_url, page
|
||||
"{file_url}#page={page}&toolbar=0&statusbar=0&zoom=page-fit"
|
||||
);
|
||||
tab.navigate_to(&target)
|
||||
.map_err(|err| AppError::Processing(format!("Failed to navigate to PDF page: {err}")))?
|
||||
@@ -279,7 +278,7 @@ async fn vision_markdown(
|
||||
let mut markdown_sections = Vec::with_capacity(rendered_pages.len());
|
||||
|
||||
for (batch_idx, chunk) in rendered_pages.chunks(PAGES_PER_VISION_CHUNK).enumerate() {
|
||||
let total_image_bytes: usize = chunk.iter().map(|bytes| bytes.len()).sum();
|
||||
let total_image_bytes: usize = chunk.iter().map(std::vec::Vec::len).sum();
|
||||
debug!(
|
||||
batch = batch_idx,
|
||||
pages = chunk.len(),
|
||||
@@ -318,7 +317,7 @@ async fn vision_markdown(
|
||||
);
|
||||
|
||||
for encoded in &encoded_images {
|
||||
let image_url = format!("data:image/png;base64,{}", encoded);
|
||||
let image_url = format!("data:image/png;base64,{encoded}");
|
||||
content_parts.push(
|
||||
ChatCompletionRequestMessageContentPartImageArgs::default()
|
||||
.image_url(
|
||||
@@ -413,7 +412,7 @@ fn looks_good_enough(text: &str) -> bool {
|
||||
return false;
|
||||
}
|
||||
|
||||
let ascii_chars = text.chars().filter(|c| c.is_ascii()).count() as f64;
|
||||
let ascii_chars = text.chars().filter(char::is_ascii).count() as f64;
|
||||
let ascii_ratio = ascii_chars / total_chars;
|
||||
if ascii_ratio < FAST_PATH_MIN_ASCII_RATIO {
|
||||
return false;
|
||||
@@ -484,8 +483,7 @@ fn is_structural_line(line: &str) -> bool {
|
||||
|| lowered
|
||||
.chars()
|
||||
.next()
|
||||
.map(|c| c.is_ascii_digit())
|
||||
.unwrap_or(false)
|
||||
.is_some_and(|c| c.is_ascii_digit())
|
||||
&& lowered.contains('.')
|
||||
}
|
||||
|
||||
@@ -572,14 +570,13 @@ fn prepare_pdf_viewer(tab: &headless_chrome::Tab, page_number: u32) {
|
||||
const toolbar = app.shadowRoot.querySelector('#toolbar');
|
||||
if (toolbar) {{ toolbar.style.display = 'none'; }}
|
||||
}}
|
||||
const page = viewer.shadowRoot.querySelector('viewer-page:nth-of-type({page})');
|
||||
const page = viewer.shadowRoot.querySelector('viewer-page:nth-of-type({page_number})');
|
||||
if (page && page.scrollIntoView) {{
|
||||
page.scrollIntoView({{ block: 'start', inline: 'center' }});
|
||||
}}
|
||||
const canvas = viewer.shadowRoot.querySelector('canvas[aria-label="Page {page}"]');
|
||||
const canvas = viewer.shadowRoot.querySelector('canvas[aria-label="Page {page_number}"]');
|
||||
return !!canvas;
|
||||
}})()"#,
|
||||
page = page_number
|
||||
}})()"#
|
||||
);
|
||||
|
||||
match tab.evaluate(&script, false) {
|
||||
@@ -607,12 +604,11 @@ fn canvas_viewport_for_page(
|
||||
if (!embed || !embed.shadowRoot) return null;
|
||||
const viewer = embed.shadowRoot.querySelector('pdf-viewer');
|
||||
if (!viewer || !viewer.shadowRoot) return null;
|
||||
const canvas = viewer.shadowRoot.querySelector('canvas[aria-label="Page {page}"]');
|
||||
const canvas = viewer.shadowRoot.querySelector('canvas[aria-label="Page {page_number}"]');
|
||||
if (!canvas) return null;
|
||||
const rect = canvas.getBoundingClientRect();
|
||||
return {{ x: rect.x, y: rect.y, width: rect.width, height: rect.height }};
|
||||
}})()"#,
|
||||
page = page_number
|
||||
}})()"#
|
||||
);
|
||||
|
||||
let result = tab
|
||||
@@ -683,7 +679,7 @@ fn capture_full_page_png(tab: &headless_chrome::Tab) -> Result<Vec<u8>, AppError
|
||||
})
|
||||
}
|
||||
|
||||
fn is_suspicious_image(len: usize) -> bool {
|
||||
const fn is_suspicious_image(len: usize) -> bool {
|
||||
len < MIN_PAGE_IMAGE_BYTES
|
||||
}
|
||||
|
||||
@@ -710,7 +706,7 @@ fn is_low_quality_response(content: &str) -> bool {
|
||||
lowered.contains("unable to") || lowered.contains("cannot")
|
||||
}
|
||||
|
||||
fn prompt_for_attempt(attempt: usize, base_prompt: &str) -> &str {
|
||||
const fn prompt_for_attempt(attempt: usize, base_prompt: &str) -> &str {
|
||||
if attempt == 0 {
|
||||
base_prompt
|
||||
} else {
|
||||
|
||||
Reference in New Issue
Block a user