fix: name harmonization of endpoints & ingestion security hardening

This commit is contained in:
Per Stark
2026-02-13 22:36:00 +01:00
parent f22cac891c
commit e07199adfc
15 changed files with 258 additions and 53 deletions

View File

@@ -1,8 +1,7 @@
# Changelog # Changelog
## Unreleased ## Unreleased
- Fix: edge case where navigation back to a chat page could trigger a new response generation - Fix: edge case where navigation back to a chat page could trigger a new response generation
- Security: hardened storage-layer queries by replacing user-influenced string interpolation with bound parameters and adding injection regression tests. - Security: Misc security fixes
- Security: removed raw ingestion payload logging from API/HTML ingress handlers and replaced it with metadata-only structured logs.
## 1.0.1 (2026-02-11) ## 1.0.1 (2026-02-11)
- Shipped an S3 storage backend so content can be stored in object storage instead of local disk, with configuration support for S3 deployments. - Shipped an S3 storage backend so content can be stored in object storage instead of local disk, with configuration support for S3 deployments.

View File

@@ -20,6 +20,9 @@ pub enum ApiError {
#[error("Unauthorized: {0}")] #[error("Unauthorized: {0}")]
Unauthorized(String), Unauthorized(String),
#[error("Payload too large: {0}")]
PayloadTooLarge(String),
} }
impl From<AppError> for ApiError { impl From<AppError> for ApiError {
@@ -67,6 +70,13 @@ impl IntoResponse for ApiError {
status: "error".to_string(), status: "error".to_string(),
}, },
), ),
Self::PayloadTooLarge(message) => (
StatusCode::PAYLOAD_TOO_LARGE,
ErrorResponse {
error: message,
status: "error".to_string(),
},
),
}; };
(status, Json(error_response)).into_response() (status, Json(error_response)).into_response()
@@ -132,6 +142,10 @@ mod tests {
// Test unauthorized status // Test unauthorized status
let error = ApiError::Unauthorized("not allowed".to_string()); let error = ApiError::Unauthorized("not allowed".to_string());
assert_status_code(error, StatusCode::UNAUTHORIZED); assert_status_code(error, StatusCode::UNAUTHORIZED);
// Test payload too large status
let error = ApiError::PayloadTooLarge("too big".to_string());
assert_status_code(error, StatusCode::PAYLOAD_TOO_LARGE);
} }
// Alternative approach that doesn't try to parse the response body // Alternative approach that doesn't try to parse the response body

View File

@@ -6,7 +6,7 @@ use axum::{
Router, Router,
}; };
use middleware_api_auth::api_auth; use middleware_api_auth::api_auth;
use routes::{categories::get_categories, ingress::ingest_data, liveness::live, readiness::ready}; use routes::{categories::get_categories, ingest::ingest_data, liveness::live, readiness::ready};
pub mod api_state; pub mod api_state;
pub mod error; pub mod error;
@@ -26,9 +26,13 @@ where
// Protected API endpoints (require auth) // Protected API endpoints (require auth)
let protected = Router::new() let protected = Router::new()
.route("/ingress", post(ingest_data)) .route(
"/ingest",
post(ingest_data).layer(DefaultBodyLimit::max(
app_state.config.ingest_max_body_bytes,
)),
)
.route("/categories", get(get_categories)) .route("/categories", get(get_categories))
.layer(DefaultBodyLimit::max(1024 * 1024 * 1024))
.route_layer(from_fn_with_state(app_state.clone(), api_auth)); .route_layer(from_fn_with_state(app_state.clone(), api_auth));
public.merge(protected) public.merge(protected)

View File

@@ -6,6 +6,7 @@ use common::{
file_info::FileInfo, ingestion_payload::IngestionPayload, ingestion_task::IngestionTask, file_info::FileInfo, ingestion_payload::IngestionPayload, ingestion_task::IngestionTask,
user::User, user::User,
}, },
utils::ingest_limits::{validate_ingest_input, IngestValidationError},
}; };
use futures::{future::try_join_all, TryFutureExt}; use futures::{future::try_join_all, TryFutureExt};
use serde_json::json; use serde_json::json;
@@ -19,7 +20,7 @@ pub struct IngestParams {
pub content: Option<String>, pub content: Option<String>,
pub context: String, pub context: String,
pub category: String, pub category: String,
#[form_data(limit = "10000000")] // Adjust limit as needed #[form_data(limit = "20000000")]
#[form_data(default)] #[form_data(default)]
pub files: Vec<FieldData<NamedTempFile>>, pub files: Vec<FieldData<NamedTempFile>>,
} }
@@ -36,6 +37,22 @@ pub async fn ingest_data(
let category_bytes = input.category.len(); let category_bytes = input.category.len();
let file_count = input.files.len(); let file_count = input.files.len();
match validate_ingest_input(
&state.config,
input.content.as_deref(),
&input.context,
&input.category,
file_count,
) {
Ok(()) => {}
Err(IngestValidationError::PayloadTooLarge(message)) => {
return Err(ApiError::PayloadTooLarge(message));
}
Err(IngestValidationError::BadRequest(message)) => {
return Err(ApiError::ValidationError(message));
}
}
info!( info!(
user_id = %user_id, user_id = %user_id,
has_content, has_content,
@@ -43,7 +60,7 @@ pub async fn ingest_data(
context_bytes, context_bytes,
category_bytes, category_bytes,
file_count, file_count,
"Received ingestion request" "Received ingest request"
); );
let file_infos = try_join_all(input.files.into_iter().map(|file| { let file_infos = try_join_all(input.files.into_iter().map(|file| {

View File

@@ -1,4 +1,4 @@
pub mod categories; pub mod categories;
pub mod ingress; pub mod ingest;
pub mod liveness; pub mod liveness;
pub mod readiness; pub mod readiness;

View File

@@ -86,6 +86,16 @@ pub struct AppConfig {
pub retrieval_strategy: Option<String>, pub retrieval_strategy: Option<String>,
#[serde(default)] #[serde(default)]
pub embedding_backend: EmbeddingBackend, pub embedding_backend: EmbeddingBackend,
#[serde(default = "default_ingest_max_body_bytes")]
pub ingest_max_body_bytes: usize,
#[serde(default = "default_ingest_max_files")]
pub ingest_max_files: usize,
#[serde(default = "default_ingest_max_content_bytes")]
pub ingest_max_content_bytes: usize,
#[serde(default = "default_ingest_max_context_bytes")]
pub ingest_max_context_bytes: usize,
#[serde(default = "default_ingest_max_category_bytes")]
pub ingest_max_category_bytes: usize,
} }
/// Default data directory for persisted assets. /// Default data directory for persisted assets.
@@ -103,6 +113,26 @@ fn default_reranking_enabled() -> bool {
false false
} }
/// Default for `ingest_max_body_bytes`: 20,000,000 bytes (20 MB, decimal).
fn default_ingest_max_body_bytes() -> usize {
    const DEFAULT_BODY_BYTES: usize = 20 * 1_000 * 1_000;
    DEFAULT_BODY_BYTES
}
/// Default for `ingest_max_files`: at most five files per ingest request.
fn default_ingest_max_files() -> usize {
    const DEFAULT_FILE_COUNT: usize = 5;
    DEFAULT_FILE_COUNT
}
/// Default for `ingest_max_content_bytes`: 256 KiB (262,144 bytes).
fn default_ingest_max_content_bytes() -> usize {
    const DEFAULT_CONTENT_BYTES: usize = 256 * 1024;
    DEFAULT_CONTENT_BYTES
}
/// Default for `ingest_max_context_bytes`: 16 KiB (16,384 bytes).
fn default_ingest_max_context_bytes() -> usize {
    const DEFAULT_CONTEXT_BYTES: usize = 16 * 1024;
    DEFAULT_CONTEXT_BYTES
}
/// Default for `ingest_max_category_bytes`: 128 bytes.
fn default_ingest_max_category_bytes() -> usize {
    const DEFAULT_CATEGORY_BYTES: usize = 128;
    DEFAULT_CATEGORY_BYTES
}
pub fn ensure_ort_path() { pub fn ensure_ort_path() {
if env::var_os("ORT_DYLIB_PATH").is_some() { if env::var_os("ORT_DYLIB_PATH").is_some() {
return; return;
@@ -157,6 +187,11 @@ impl Default for AppConfig {
fastembed_max_length: None, fastembed_max_length: None,
retrieval_strategy: None, retrieval_strategy: None,
embedding_backend: EmbeddingBackend::default(), embedding_backend: EmbeddingBackend::default(),
ingest_max_body_bytes: default_ingest_max_body_bytes(),
ingest_max_files: default_ingest_max_files(),
ingest_max_content_bytes: default_ingest_max_content_bytes(),
ingest_max_context_bytes: default_ingest_max_context_bytes(),
ingest_max_category_bytes: default_ingest_max_category_bytes(),
} }
} }
} }

View File

@@ -0,0 +1,113 @@
use super::config::AppConfig;
/// Reasons an ingest request is rejected by [`validate_ingest_input`].
///
/// Every variant carries the human-readable message describing the limit
/// that was violated; `Display` prints exactly that message.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum IngestValidationError {
    /// A text field (`content`, `context` or `category`) exceeded its
    /// configured byte limit.
    PayloadTooLarge(String),
    /// The request is otherwise unacceptable (currently: too many files).
    BadRequest(String),
}

impl std::fmt::Display for IngestValidationError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Both variants wrap a ready-made message, so print it verbatim.
        match self {
            Self::PayloadTooLarge(message) | Self::BadRequest(message) => f.write_str(message),
        }
    }
}

// Lets the error flow through `?` into `Box<dyn Error>` and similar wrappers.
impl std::error::Error for IngestValidationError {}
pub fn validate_ingest_input(
config: &AppConfig,
content: Option<&str>,
context: &str,
category: &str,
file_count: usize,
) -> Result<(), IngestValidationError> {
if file_count > config.ingest_max_files {
return Err(IngestValidationError::BadRequest(format!(
"Too many files. Maximum allowed is {}",
config.ingest_max_files
)));
}
if let Some(content) = content {
if content.len() > config.ingest_max_content_bytes {
return Err(IngestValidationError::PayloadTooLarge(format!(
"Content is too large. Maximum allowed is {} bytes",
config.ingest_max_content_bytes
)));
}
}
if context.len() > config.ingest_max_context_bytes {
return Err(IngestValidationError::PayloadTooLarge(format!(
"Context is too large. Maximum allowed is {} bytes",
config.ingest_max_context_bytes
)));
}
if category.len() > config.ingest_max_category_bytes {
return Err(IngestValidationError::PayloadTooLarge(format!(
"Category is too large. Maximum allowed is {} bytes",
config.ingest_max_category_bytes
)));
}
Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// One file more than `ingest_max_files` is rejected as a bad request.
    #[test]
    fn validate_ingest_input_rejects_too_many_files() {
        let config = AppConfig {
            ingest_max_files: 1,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, Some("ok"), "ctx", "cat", 2);
        assert!(matches!(result, Err(IngestValidationError::BadRequest(_))));
    }

    /// `content` one byte over its limit is rejected as too large.
    #[test]
    fn validate_ingest_input_rejects_oversized_content() {
        let config = AppConfig {
            ingest_max_content_bytes: 4,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, Some("12345"), "ctx", "cat", 0);
        assert!(matches!(
            result,
            Err(IngestValidationError::PayloadTooLarge(_))
        ));
    }

    /// `context` over its limit is rejected as too large.
    #[test]
    fn validate_ingest_input_rejects_oversized_context() {
        let config = AppConfig {
            ingest_max_context_bytes: 2,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, None, "long", "cat", 0);
        assert!(matches!(
            result,
            Err(IngestValidationError::PayloadTooLarge(_))
        ));
    }

    /// `category` over its limit is rejected as too large.
    #[test]
    fn validate_ingest_input_rejects_oversized_category() {
        let config = AppConfig {
            ingest_max_category_bytes: 2,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, None, "ok", "long", 0);
        assert!(matches!(
            result,
            Err(IngestValidationError::PayloadTooLarge(_))
        ));
    }

    /// A small request passes under the default limits.
    #[test]
    fn validate_ingest_input_accepts_valid_payload() {
        let config = AppConfig::default();
        let result = validate_ingest_input(&config, Some("ok"), "ctx", "cat", 1);
        assert!(result.is_ok());
    }

    /// Limits are inclusive: values exactly at every limit must pass.
    #[test]
    fn validate_ingest_input_accepts_values_at_limits() {
        let config = AppConfig {
            ingest_max_files: 1,
            ingest_max_content_bytes: 4,
            ingest_max_context_bytes: 3,
            ingest_max_category_bytes: 3,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, Some("1234"), "ctx", "cat", 1);
        assert!(result.is_ok());
    }

    /// A missing `content` field is never measured, even with a zero limit.
    #[test]
    fn validate_ingest_input_skips_content_check_when_absent() {
        let config = AppConfig {
            ingest_max_content_bytes: 0,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, None, "ctx", "cat", 0);
        assert!(result.is_ok());
    }

    /// Limits count UTF-8 bytes, not characters: two 2-byte characters
    /// exceed a 3-byte limit.
    #[test]
    fn validate_ingest_input_measures_bytes_not_chars() {
        let config = AppConfig {
            ingest_max_content_bytes: 3,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, Some("\u{e5}\u{e4}"), "ctx", "cat", 0);
        assert!(matches!(
            result,
            Err(IngestValidationError::PayloadTooLarge(_))
        ));
    }
}

View File

@@ -1,3 +1,4 @@
pub mod config; pub mod config;
pub mod embedding; pub mod embedding;
pub mod ingest_limits;
pub mod template_engine; pub mod template_engine;

View File

@@ -29,6 +29,11 @@ Minne can be configured via environment variables or a `config.yaml` file. Envir
| `FASTEMBED_CACHE_DIR` | Model cache directory | `<data_dir>/fastembed` | | `FASTEMBED_CACHE_DIR` | Model cache directory | `<data_dir>/fastembed` |
| `FASTEMBED_SHOW_DOWNLOAD_PROGRESS` | Show progress bar for model downloads | `false` | | `FASTEMBED_SHOW_DOWNLOAD_PROGRESS` | Show progress bar for model downloads | `false` |
| `FASTEMBED_MAX_LENGTH` | Max sequence length for FastEmbed models | - | | `FASTEMBED_MAX_LENGTH` | Max sequence length for FastEmbed models | - |
| `INGEST_MAX_BODY_BYTES` | Max request body size for ingest endpoints | `20000000` |
| `INGEST_MAX_FILES` | Max files allowed per ingest request | `5` |
| `INGEST_MAX_CONTENT_BYTES` | Max `content` field size for ingest requests | `262144` |
| `INGEST_MAX_CONTEXT_BYTES` | Max `context` field size for ingest requests | `16384` |
| `INGEST_MAX_CATEGORY_BYTES` | Max `category` field size for ingest requests | `128` |
### S3 Storage (Optional) ### S3 Storage (Optional)
@@ -76,6 +81,13 @@ embedding_backend: "fastembed"
# Optional reranking # Optional reranking
reranking_enabled: true reranking_enabled: true
reranking_pool_size: 2 reranking_pool_size: 2
# Ingest safety limits
ingest_max_body_bytes: 20000000
ingest_max_files: 5
ingest_max_content_bytes: 262144
ingest_max_context_bytes: 16384
ingest_max_category_bytes: 128
``` ```
## AI Provider Setup ## AI Provider Setup

View File

@@ -35,7 +35,9 @@ where
.add_protected_routes(routes::chat::router()) .add_protected_routes(routes::chat::router())
.add_protected_routes(routes::content::router()) .add_protected_routes(routes::content::router())
.add_protected_routes(routes::knowledge::router()) .add_protected_routes(routes::knowledge::router())
.add_protected_routes(routes::ingestion::router()) .add_protected_routes(routes::ingestion::router(
app_state.config.ingest_max_body_bytes,
))
.add_protected_routes(routes::scratchpad::router()) .add_protected_routes(routes::scratchpad::router())
.with_compression() .with_compression()
.build() .build()

View File

@@ -2,9 +2,10 @@ use std::{pin::Pin, time::Duration};
use axum::{ use axum::{
extract::{Query, State}, extract::{Query, State},
http::StatusCode,
response::{ response::{
sse::{Event, KeepAlive}, sse::{Event, KeepAlive},
Html, IntoResponse, Sse, Html, IntoResponse, Response, Sse,
}, },
}; };
use axum_typed_multipart::{FieldData, TryFromMultipart, TypedMultipart}; use axum_typed_multipart::{FieldData, TryFromMultipart, TypedMultipart};
@@ -23,6 +24,7 @@ use common::{
ingestion_task::{IngestionTask, TaskState}, ingestion_task::{IngestionTask, TaskState},
user::User, user::User,
}, },
utils::ingest_limits::{validate_ingest_input, IngestValidationError},
}; };
use crate::{ use crate::{
@@ -34,30 +36,32 @@ use crate::{
AuthSessionType, AuthSessionType,
}; };
pub async fn show_ingress_form( pub async fn show_ingest_form(
State(state): State<HtmlState>, State(state): State<HtmlState>,
RequireUser(user): RequireUser, RequireUser(user): RequireUser,
) -> Result<impl IntoResponse, HtmlError> { ) -> Result<impl IntoResponse, HtmlError> {
let user_categories = User::get_user_categories(&user.id, &state.db).await?; let user_categories = User::get_user_categories(&user.id, &state.db).await?;
#[derive(Serialize)] #[derive(Serialize)]
pub struct ShowIngressFormData { pub struct ShowIngestFormData {
user_categories: Vec<String>, user_categories: Vec<String>,
} }
Ok(TemplateResponse::new_template( Ok(TemplateResponse::new_template(
"ingestion_modal.html", "ingestion_modal.html",
ShowIngressFormData { user_categories }, ShowIngestFormData { user_categories },
)) ))
} }
pub async fn hide_ingress_form( pub async fn hide_ingest_form(
RequireUser(_user): RequireUser, RequireUser(_user): RequireUser,
) -> Result<impl IntoResponse, HtmlError> { ) -> Result<impl IntoResponse, HtmlError> {
Ok(Html( Ok(
"<a class='btn btn-primary' hx-get='/ingress-form' hx-swap='outerHTML'>Add Content</a>", Html(
"<a class='btn btn-primary' hx-get='/ingest-form' hx-swap='outerHTML'>Add Content</a>",
)
.into_response(),
) )
.into_response())
} }
#[derive(Debug, TryFromMultipart)] #[derive(Debug, TryFromMultipart)]
@@ -65,34 +69,22 @@ pub struct IngestionParams {
pub content: Option<String>, pub content: Option<String>,
pub context: String, pub context: String,
pub category: String, pub category: String,
#[form_data(limit = "10000000")] // Adjust limit as needed #[form_data(limit = "20000000")]
#[form_data(default)] #[form_data(default)]
pub files: Vec<FieldData<NamedTempFile>>, pub files: Vec<FieldData<NamedTempFile>>,
} }
pub async fn process_ingress_form( pub async fn process_ingest_form(
State(state): State<HtmlState>, State(state): State<HtmlState>,
RequireUser(user): RequireUser, RequireUser(user): RequireUser,
TypedMultipart(input): TypedMultipart<IngestionParams>, TypedMultipart(input): TypedMultipart<IngestionParams>,
) -> Result<impl IntoResponse, HtmlError> { ) -> Result<Response, HtmlError> {
#[derive(Serialize)]
pub struct IngressFormData {
context: String,
content: String,
category: String,
error: String,
}
if input.content.as_ref().is_none_or(|c| c.len() < 2) && input.files.is_empty() { if input.content.as_ref().is_none_or(|c| c.len() < 2) && input.files.is_empty() {
return Ok(TemplateResponse::new_template( return Ok((
"index/signed_in/ingress_form.html", StatusCode::BAD_REQUEST,
IngressFormData { "You need to either add files or content",
context: input.context.clone(), )
content: input.content.clone().unwrap_or_default(), .into_response());
category: input.category.clone(),
error: "You need to either add files or content".to_string(),
},
));
} }
let content_bytes = input.content.as_ref().map_or(0, |c| c.len()); let content_bytes = input.content.as_ref().map_or(0, |c| c.len());
@@ -101,6 +93,22 @@ pub async fn process_ingress_form(
let category_bytes = input.category.len(); let category_bytes = input.category.len();
let file_count = input.files.len(); let file_count = input.files.len();
match validate_ingest_input(
&state.config,
input.content.as_deref(),
&input.context,
&input.category,
file_count,
) {
Ok(()) => {}
Err(IngestValidationError::PayloadTooLarge(message)) => {
return Ok((StatusCode::PAYLOAD_TOO_LARGE, message).into_response());
}
Err(IngestValidationError::BadRequest(message)) => {
return Ok((StatusCode::BAD_REQUEST, message).into_response());
}
}
info!( info!(
user_id = %user.id, user_id = %user.id,
has_content, has_content,
@@ -108,7 +116,7 @@ pub async fn process_ingress_form(
context_bytes, context_bytes,
category_bytes, category_bytes,
file_count, file_count,
"Received ingestion form submission" "Received ingest form submission"
); );
let file_infos = try_join_all(input.files.into_iter().map(|file| { let file_infos = try_join_all(input.files.into_iter().map(|file| {
@@ -137,10 +145,10 @@ pub async fn process_ingress_form(
tasks: Vec<IngestionTask>, tasks: Vec<IngestionTask>,
} }
Ok(TemplateResponse::new_template( Ok(
"dashboard/current_task.html", TemplateResponse::new_template("dashboard/current_task.html", NewTasksData { tasks })
NewTasksData { tasks }, .into_response(),
)) )
} }
#[derive(Deserialize)] #[derive(Deserialize)]

View File

@@ -1,22 +1,22 @@
mod handlers; mod handlers;
use axum::{extract::FromRef, routing::get, Router}; use axum::{extract::DefaultBodyLimit, extract::FromRef, routing::get, Router};
use handlers::{ use handlers::{get_task_updates_stream, hide_ingest_form, process_ingest_form, show_ingest_form};
get_task_updates_stream, hide_ingress_form, process_ingress_form, show_ingress_form,
};
use crate::html_state::HtmlState; use crate::html_state::HtmlState;
pub fn router<S>() -> Router<S> pub fn router<S>(max_body_bytes: usize) -> Router<S>
where where
S: Clone + Send + Sync + 'static, S: Clone + Send + Sync + 'static,
HtmlState: FromRef<S>, HtmlState: FromRef<S>,
{ {
Router::new() Router::new()
.route( .route(
"/ingress-form", "/ingest-form",
get(show_ingress_form).post(process_ingress_form), get(show_ingest_form)
.post(process_ingest_form)
.layer(DefaultBodyLimit::max(max_body_bytes)),
) )
.route("/task/status-stream", get(get_task_updates_stream)) .route("/task/status-stream", get(get_task_updates_stream))
.route("/hide-ingress-form", get(hide_ingress_form)) .route("/hide-ingest-form", get(hide_ingest_form))
} }

View File

@@ -2,7 +2,7 @@
{% block dashboard_header %} {% block dashboard_header %}
<h1 class="text-xl font-extrabold tracking-tight">Dashboard</h1> <h1 class="text-xl font-extrabold tracking-tight">Dashboard</h1>
<button class="nb-btn nb-cta" hx-get="/ingress-form" hx-target="#modal" hx-swap="innerHTML"> <button class="nb-btn nb-cta" hx-get="/ingest-form" hx-target="#modal" hx-swap="innerHTML">
{% include "icons/send_icon.html" %} {% include "icons/send_icon.html" %}
<span class="ml-2">Add Content</span> <span class="ml-2">Add Content</span>
</button> </button>

View File

@@ -3,7 +3,7 @@
{% block modal_class %}max-w-3xl{% endblock %} {% block modal_class %}max-w-3xl{% endblock %}
{% block form_attributes %} {% block form_attributes %}
hx-post="/ingress-form" hx-post="/ingest-form"
enctype="multipart/form-data" enctype="multipart/form-data"
{% endblock %} {% endblock %}

View File

@@ -18,7 +18,7 @@
</li> </li>
{% endfor %} {% endfor %}
<li> <li>
<button class="nb-btn nb-cta w-full flex items-center gap-3 justify-start mt-2" hx-get="/ingress-form" <button class="nb-btn nb-cta w-full flex items-center gap-3 justify-start mt-2" hx-get="/ingest-form"
hx-target="#modal" hx-swap="innerHTML">{% include "icons/send_icon.html" %} Add hx-target="#modal" hx-swap="innerHTML">{% include "icons/send_icon.html" %} Add
Content</button> Content</button>
</li> </li>