From e07199adfc33cc497928b8f0c8ad70923aaba797 Mon Sep 17 00:00:00 2001 From: Per Stark Date: Fri, 13 Feb 2026 22:36:00 +0100 Subject: [PATCH] fix: name harmonization of endpoints & ingestion security hardening --- CHANGELOG.md | 3 +- api-router/src/error.rs | 14 +++ api-router/src/lib.rs | 10 +- .../src/routes/{ingress.rs => ingest.rs} | 21 +++- api-router/src/routes/mod.rs | 2 +- common/src/utils/config.rs | 35 ++++++ common/src/utils/ingest_limits.rs | 113 ++++++++++++++++++ common/src/utils/mod.rs | 1 + docs/configuration.md | 12 ++ html-router/src/lib.rs | 4 +- html-router/src/routes/ingestion/handlers.rs | 74 +++++++----- html-router/src/routes/ingestion/mod.rs | 16 +-- html-router/templates/dashboard/_base.html | 2 +- html-router/templates/ingestion_modal.html | 2 +- html-router/templates/sidebar.html | 2 +- 15 files changed, 258 insertions(+), 53 deletions(-) rename api-router/src/routes/{ingress.rs => ingest.rs} (77%) create mode 100644 common/src/utils/ingest_limits.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index e20cccb..7833b78 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,7 @@ # Changelog ## Unreleased - Fix: edge case where navigation back to a chat page could trigger a new response generation -- Security: hardened storage-layer queries by replacing user-influenced string interpolation with bound parameters and adding injection regression tests. -- Security: removed raw ingestion payload logging from API/HTML ingress handlers and replaced it with metadata-only structured logs. +- Security: Misc security fixes ## 1.0.1 (2026-02-11) - Shipped an S3 storage backend so content can be stored in object storage instead of local disk, with configuration support for S3 deployments. diff --git a/api-router/src/error.rs b/api-router/src/error.rs index bb2672b..f127752 100644 --- a/api-router/src/error.rs +++ b/api-router/src/error.rs @@ -20,6 +20,9 @@ pub enum ApiError { #[error("Unauthorized: {0}")] Unauthorized(String), + + #[error("Payload too large: {0}")] + PayloadTooLarge(String), } impl From for ApiError { @@ -67,6 +70,13 @@ impl IntoResponse for ApiError { status: "error".to_string(), }, ), + Self::PayloadTooLarge(message) => ( + StatusCode::PAYLOAD_TOO_LARGE, + ErrorResponse { + error: message, + status: "error".to_string(), + }, + ), }; (status, Json(error_response)).into_response() @@ -132,6 +142,10 @@ mod tests { // Test unauthorized status let error = ApiError::Unauthorized("not allowed".to_string()); assert_status_code(error, StatusCode::UNAUTHORIZED); + + // Test payload too large status + let error = ApiError::PayloadTooLarge("too big".to_string()); + assert_status_code(error, StatusCode::PAYLOAD_TOO_LARGE); } // Alternative approach that doesn't try to parse the response body diff --git a/api-router/src/lib.rs b/api-router/src/lib.rs index f4834bd..b133814 100644 --- a/api-router/src/lib.rs +++ b/api-router/src/lib.rs @@ -6,7 +6,7 @@ use axum::{ Router, }; use middleware_api_auth::api_auth; -use routes::{categories::get_categories, ingress::ingest_data, liveness::live, readiness::ready}; +use routes::{categories::get_categories, ingest::ingest_data, liveness::live, readiness::ready}; pub mod api_state; pub mod error; @@ -26,9 +26,13 @@ where // Protected API endpoints (require auth) let protected = Router::new() - .route("/ingress", post(ingest_data)) + .route( + "/ingest", + post(ingest_data).layer(DefaultBodyLimit::max( + app_state.config.ingest_max_body_bytes, + )), + ) .route("/categories", get(get_categories)) - .layer(DefaultBodyLimit::max(1024 * 1024 * 1024)) .route_layer(from_fn_with_state(app_state.clone(), api_auth)); public.merge(protected) diff --git a/api-router/src/routes/ingress.rs b/api-router/src/routes/ingest.rs similarity index 77% rename from api-router/src/routes/ingress.rs rename to api-router/src/routes/ingest.rs index b203e73..8587dbb 100644 --- a/api-router/src/routes/ingress.rs +++ b/api-router/src/routes/ingest.rs @@ -6,6 +6,7 @@ use common::{ file_info::FileInfo, ingestion_payload::IngestionPayload, ingestion_task::IngestionTask, user::User, }, + utils::ingest_limits::{validate_ingest_input, IngestValidationError}, }; use futures::{future::try_join_all, TryFutureExt}; use serde_json::json; @@ -19,7 +20,7 @@ pub struct IngestParams { pub content: Option, pub context: String, pub category: String, - #[form_data(limit = "10000000")] // Adjust limit as needed + #[form_data(limit = "20000000")] #[form_data(default)] pub files: Vec>, } @@ -36,6 +37,22 @@ pub async fn ingest_data( let category_bytes = input.category.len(); let file_count = input.files.len(); + match validate_ingest_input( + &state.config, + input.content.as_deref(), + &input.context, + &input.category, + file_count, + ) { + Ok(()) => {} + Err(IngestValidationError::PayloadTooLarge(message)) => { + return Err(ApiError::PayloadTooLarge(message)); + } + Err(IngestValidationError::BadRequest(message)) => { + return Err(ApiError::ValidationError(message)); + } + } + info!( user_id = %user_id, has_content, @@ -43,7 +60,7 @@ pub async fn ingest_data( context_bytes, category_bytes, file_count, - "Received ingestion request" + "Received ingest request" ); let file_infos = try_join_all(input.files.into_iter().map(|file| { diff --git a/api-router/src/routes/mod.rs b/api-router/src/routes/mod.rs index 03821c4..33f95bd 100644 --- a/api-router/src/routes/mod.rs +++ b/api-router/src/routes/mod.rs @@ -1,4 +1,4 @@ pub mod categories; -pub mod ingress; +pub mod ingest; pub mod liveness; pub mod readiness; diff --git a/common/src/utils/config.rs b/common/src/utils/config.rs index b46b53b..26b761c 100644 --- a/common/src/utils/config.rs +++ b/common/src/utils/config.rs @@ -86,6 +86,16 @@ pub struct AppConfig { pub retrieval_strategy: Option, #[serde(default)] pub embedding_backend: EmbeddingBackend, + #[serde(default = "default_ingest_max_body_bytes")] + pub ingest_max_body_bytes: usize, + #[serde(default = "default_ingest_max_files")] + pub ingest_max_files: usize, + #[serde(default = "default_ingest_max_content_bytes")] + pub ingest_max_content_bytes: usize, + #[serde(default = "default_ingest_max_context_bytes")] + pub ingest_max_context_bytes: usize, + #[serde(default = "default_ingest_max_category_bytes")] + pub ingest_max_category_bytes: usize, } /// Default data directory for persisted assets. @@ -103,6 +113,26 @@ fn default_reranking_enabled() -> bool { false } +fn default_ingest_max_body_bytes() -> usize { + 20_000_000 +} + +fn default_ingest_max_files() -> usize { + 5 +} + +fn default_ingest_max_content_bytes() -> usize { + 262_144 +} + +fn default_ingest_max_context_bytes() -> usize { + 16_384 +} + +fn default_ingest_max_category_bytes() -> usize { + 128 +} + pub fn ensure_ort_path() { if env::var_os("ORT_DYLIB_PATH").is_some() { return; @@ -157,6 +187,11 @@ impl Default for AppConfig { fastembed_max_length: None, retrieval_strategy: None, embedding_backend: EmbeddingBackend::default(), + ingest_max_body_bytes: default_ingest_max_body_bytes(), + ingest_max_files: default_ingest_max_files(), + ingest_max_content_bytes: default_ingest_max_content_bytes(), + ingest_max_context_bytes: default_ingest_max_context_bytes(), + ingest_max_category_bytes: default_ingest_max_category_bytes(), } } } diff --git a/common/src/utils/ingest_limits.rs b/common/src/utils/ingest_limits.rs new file mode 100644 index 0000000..3f74ec8 --- /dev/null +++ b/common/src/utils/ingest_limits.rs @@ -0,0 +1,113 @@ +use super::config::AppConfig; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum IngestValidationError { + PayloadTooLarge(String), + BadRequest(String), +} + +pub fn validate_ingest_input( + config: &AppConfig, + content: Option<&str>, + context: &str, + category: &str, + file_count: usize, +) -> Result<(), IngestValidationError> { + if file_count > config.ingest_max_files { + return Err(IngestValidationError::BadRequest(format!( + "Too many files. Maximum allowed is {}", + config.ingest_max_files + ))); + } + + if let Some(content) = content { + if content.len() > config.ingest_max_content_bytes { + return Err(IngestValidationError::PayloadTooLarge(format!( + "Content is too large. Maximum allowed is {} bytes", + config.ingest_max_content_bytes + ))); + } + } + + if context.len() > config.ingest_max_context_bytes { + return Err(IngestValidationError::PayloadTooLarge(format!( + "Context is too large. Maximum allowed is {} bytes", + config.ingest_max_context_bytes + ))); + } + + if category.len() > config.ingest_max_category_bytes { + return Err(IngestValidationError::PayloadTooLarge(format!( + "Category is too large. Maximum allowed is {} bytes", + config.ingest_max_category_bytes + ))); + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn validate_ingest_input_rejects_too_many_files() { + let config = AppConfig { + ingest_max_files: 1, + ..Default::default() + }; + let result = validate_ingest_input(&config, Some("ok"), "ctx", "cat", 2); + + assert!(matches!(result, Err(IngestValidationError::BadRequest(_)))); + } + + #[test] + fn validate_ingest_input_rejects_oversized_content() { + let config = AppConfig { + ingest_max_content_bytes: 4, + ..Default::default() + }; + let result = validate_ingest_input(&config, Some("12345"), "ctx", "cat", 0); + + assert!(matches!( + result, + Err(IngestValidationError::PayloadTooLarge(_)) + )); + } + + #[test] + fn validate_ingest_input_rejects_oversized_context() { + let config = AppConfig { + ingest_max_context_bytes: 2, + ..Default::default() + }; + let result = validate_ingest_input(&config, None, "long", "cat", 0); + + assert!(matches!( + result, + Err(IngestValidationError::PayloadTooLarge(_)) + )); + } + + #[test] + fn validate_ingest_input_rejects_oversized_category() { + let config = AppConfig { + ingest_max_category_bytes: 2, + ..Default::default() + }; + let result = validate_ingest_input(&config, None, "ok", "long", 0); + + assert!(matches!( + result, + Err(IngestValidationError::PayloadTooLarge(_)) + )); + } + + #[test] + fn validate_ingest_input_accepts_valid_payload() { + let config = AppConfig::default(); + let result = validate_ingest_input(&config, Some("ok"), "ctx", "cat", 1); + + assert!(result.is_ok()); + } +} diff --git a/common/src/utils/mod.rs b/common/src/utils/mod.rs index 10b3f33..2801c3f 100644 --- a/common/src/utils/mod.rs +++ b/common/src/utils/mod.rs @@ -1,3 +1,4 @@ pub mod config; pub mod embedding; +pub mod ingest_limits; pub mod template_engine; diff --git a/docs/configuration.md b/docs/configuration.md index 58174f7..2018469 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -29,6 +29,11 @@ Minne can be configured via environment variables or a `config.yaml` file. Envir | `FASTEMBED_CACHE_DIR` | Model cache directory | `/fastembed` | | `FASTEMBED_SHOW_DOWNLOAD_PROGRESS` | Show progress bar for model downloads | `false` | | `FASTEMBED_MAX_LENGTH` | Max sequence length for FastEmbed models | - | +| `INGEST_MAX_BODY_BYTES` | Max request body size for ingest endpoints | `20000000` | +| `INGEST_MAX_FILES` | Max files allowed per ingest request | `5` | +| `INGEST_MAX_CONTENT_BYTES` | Max `content` field size for ingest requests | `262144` | +| `INGEST_MAX_CONTEXT_BYTES` | Max `context` field size for ingest requests | `16384` | +| `INGEST_MAX_CATEGORY_BYTES` | Max `category` field size for ingest requests | `128` | ### S3 Storage (Optional) @@ -76,6 +81,13 @@ embedding_backend: "fastembed" # Optional reranking reranking_enabled: true reranking_pool_size: 2 + +# Ingest safety limits +ingest_max_body_bytes: 20000000 +ingest_max_files: 5 +ingest_max_content_bytes: 262144 +ingest_max_context_bytes: 16384 +ingest_max_category_bytes: 128 ``` ## AI Provider Setup diff --git a/html-router/src/lib.rs b/html-router/src/lib.rs index f233832..b8ea15e 100644 --- a/html-router/src/lib.rs +++ b/html-router/src/lib.rs @@ -35,7 +35,9 @@ where .add_protected_routes(routes::chat::router()) .add_protected_routes(routes::content::router()) .add_protected_routes(routes::knowledge::router()) - .add_protected_routes(routes::ingestion::router()) + .add_protected_routes(routes::ingestion::router( + app_state.config.ingest_max_body_bytes, + )) .add_protected_routes(routes::scratchpad::router()) .with_compression() .build() diff --git a/html-router/src/routes/ingestion/handlers.rs b/html-router/src/routes/ingestion/handlers.rs index 76cc789..a037745 100644 --- a/html-router/src/routes/ingestion/handlers.rs +++ b/html-router/src/routes/ingestion/handlers.rs @@ -2,9 +2,10 @@ use std::{pin::Pin, time::Duration}; use axum::{ extract::{Query, State}, + http::StatusCode, response::{ sse::{Event, KeepAlive}, - Html, IntoResponse, Sse, + Html, IntoResponse, Response, Sse, }, }; use axum_typed_multipart::{FieldData, TryFromMultipart, TypedMultipart}; @@ -23,6 +24,7 @@ use common::{ ingestion_task::{IngestionTask, TaskState}, user::User, }, + utils::ingest_limits::{validate_ingest_input, IngestValidationError}, }; use crate::{ @@ -34,30 +36,32 @@ use crate::{ AuthSessionType, }; -pub async fn show_ingress_form( +pub async fn show_ingest_form( State(state): State, RequireUser(user): RequireUser, ) -> Result { let user_categories = User::get_user_categories(&user.id, &state.db).await?; #[derive(Serialize)] - pub struct ShowIngressFormData { + pub struct ShowIngestFormData { user_categories: Vec, } Ok(TemplateResponse::new_template( "ingestion_modal.html", - ShowIngressFormData { user_categories }, + ShowIngestFormData { user_categories }, )) } -pub async fn hide_ingress_form( +pub async fn hide_ingest_form( RequireUser(_user): RequireUser, ) -> Result { - Ok(Html( - "Add Content", + Ok( + Html( + "Add Content", + ) + .into_response(), ) - .into_response()) } #[derive(Debug, TryFromMultipart)] @@ -65,34 +69,22 @@ pub struct IngestionParams { pub content: Option, pub context: String, pub category: String, - #[form_data(limit = "10000000")] // Adjust limit as needed + #[form_data(limit = "20000000")] #[form_data(default)] pub files: Vec>, } -pub async fn process_ingress_form( +pub async fn process_ingest_form( State(state): State, RequireUser(user): RequireUser, TypedMultipart(input): TypedMultipart, -) -> Result { - #[derive(Serialize)] - pub struct IngressFormData { - context: String, - content: String, - category: String, - error: String, - } - +) -> Result { if input.content.as_ref().is_none_or(|c| c.len() < 2) && input.files.is_empty() { - return Ok(TemplateResponse::new_template( - "index/signed_in/ingress_form.html", - IngressFormData { - context: input.context.clone(), - content: input.content.clone().unwrap_or_default(), - category: input.category.clone(), - error: "You need to either add files or content".to_string(), - }, - )); + return Ok(( + StatusCode::BAD_REQUEST, + "You need to either add files or content", + ) + .into_response()); } let content_bytes = input.content.as_ref().map_or(0, |c| c.len()); @@ -101,6 +93,22 @@ pub async fn process_ingress_form( let category_bytes = input.category.len(); let file_count = input.files.len(); + match validate_ingest_input( + &state.config, + input.content.as_deref(), + &input.context, + &input.category, + file_count, + ) { + Ok(()) => {} + Err(IngestValidationError::PayloadTooLarge(message)) => { + return Ok((StatusCode::PAYLOAD_TOO_LARGE, message).into_response()); + } + Err(IngestValidationError::BadRequest(message)) => { + return Ok((StatusCode::BAD_REQUEST, message).into_response()); + } + } + info!( user_id = %user.id, has_content, @@ -108,7 +116,7 @@ pub async fn process_ingress_form( context_bytes, category_bytes, file_count, - "Received ingestion form submission" + "Received ingest form submission" ); let file_infos = try_join_all(input.files.into_iter().map(|file| { @@ -137,10 +145,10 @@ pub async fn process_ingress_form( tasks: Vec, } - Ok(TemplateResponse::new_template( - "dashboard/current_task.html", - NewTasksData { tasks }, - )) + Ok( + TemplateResponse::new_template("dashboard/current_task.html", NewTasksData { tasks }) + .into_response(), + ) } #[derive(Deserialize)] diff --git a/html-router/src/routes/ingestion/mod.rs b/html-router/src/routes/ingestion/mod.rs index fe16fc4..77ff53c 100644 --- a/html-router/src/routes/ingestion/mod.rs +++ b/html-router/src/routes/ingestion/mod.rs @@ -1,22 +1,22 @@ mod handlers; -use axum::{extract::FromRef, routing::get, Router}; -use handlers::{ - get_task_updates_stream, hide_ingress_form, process_ingress_form, show_ingress_form, -}; +use axum::{extract::DefaultBodyLimit, extract::FromRef, routing::get, Router}; +use handlers::{get_task_updates_stream, hide_ingest_form, process_ingest_form, show_ingest_form}; use crate::html_state::HtmlState; -pub fn router() -> Router +pub fn router(max_body_bytes: usize) -> Router where S: Clone + Send + Sync + 'static, HtmlState: FromRef, { Router::new() .route( - "/ingress-form", - get(show_ingress_form).post(process_ingress_form), + "/ingest-form", + get(show_ingest_form) + .post(process_ingest_form) + .layer(DefaultBodyLimit::max(max_body_bytes)), ) .route("/task/status-stream", get(get_task_updates_stream)) - .route("/hide-ingress-form", get(hide_ingress_form)) + .route("/hide-ingest-form", get(hide_ingest_form)) } diff --git a/html-router/templates/dashboard/_base.html b/html-router/templates/dashboard/_base.html index cfc6e0a..214415c 100644 --- a/html-router/templates/dashboard/_base.html +++ b/html-router/templates/dashboard/_base.html @@ -2,7 +2,7 @@ {% block dashboard_header %}

Dashboard

- diff --git a/html-router/templates/ingestion_modal.html b/html-router/templates/ingestion_modal.html index f48e9cc..1be0f25 100644 --- a/html-router/templates/ingestion_modal.html +++ b/html-router/templates/ingestion_modal.html @@ -3,7 +3,7 @@ {% block modal_class %}max-w-3xl{% endblock %} {% block form_attributes %} -hx-post="/ingress-form" +hx-post="/ingest-form" enctype="multipart/form-data" {% endblock %} diff --git a/html-router/templates/sidebar.html b/html-router/templates/sidebar.html index fa6d70b..6aad686 100644 --- a/html-router/templates/sidebar.html +++ b/html-router/templates/sidebar.html @@ -18,7 +18,7 @@ {% endfor %}
  • -