fix: name harmonization of endpoints & ingestion security hardening

This commit is contained in:
Per Stark
2026-02-13 22:36:00 +01:00
parent f22cac891c
commit e07199adfc
15 changed files with 258 additions and 53 deletions

View File

@@ -1,8 +1,7 @@
# Changelog
## Unreleased
- Fix: edge case where navigation back to a chat page could trigger a new response generation
- Security: hardened storage-layer queries by replacing user-influenced string interpolation with bound parameters and adding injection regression tests.
- Security: removed raw ingestion payload logging from API/HTML ingress handlers and replaced it with metadata-only structured logs.
- Security: miscellaneous security fixes, including configurable size limits on ingest requests.
## 1.0.1 (2026-02-11)
- Shipped an S3 storage backend so content can be stored in object storage instead of local disk, with configuration support for S3 deployments.

View File

@@ -20,6 +20,9 @@ pub enum ApiError {
#[error("Unauthorized: {0}")]
Unauthorized(String),
#[error("Payload too large: {0}")]
PayloadTooLarge(String),
}
impl From<AppError> for ApiError {
@@ -67,6 +70,13 @@ impl IntoResponse for ApiError {
status: "error".to_string(),
},
),
Self::PayloadTooLarge(message) => (
StatusCode::PAYLOAD_TOO_LARGE,
ErrorResponse {
error: message,
status: "error".to_string(),
},
),
};
(status, Json(error_response)).into_response()
@@ -132,6 +142,10 @@ mod tests {
// Test unauthorized status
let error = ApiError::Unauthorized("not allowed".to_string());
assert_status_code(error, StatusCode::UNAUTHORIZED);
// Test payload too large status
let error = ApiError::PayloadTooLarge("too big".to_string());
assert_status_code(error, StatusCode::PAYLOAD_TOO_LARGE);
}
// Alternative approach that doesn't try to parse the response body

View File

@@ -6,7 +6,7 @@ use axum::{
Router,
};
use middleware_api_auth::api_auth;
use routes::{categories::get_categories, ingress::ingest_data, liveness::live, readiness::ready};
use routes::{categories::get_categories, ingest::ingest_data, liveness::live, readiness::ready};
pub mod api_state;
pub mod error;
@@ -26,9 +26,13 @@ where
// Protected API endpoints (require auth)
let protected = Router::new()
.route("/ingress", post(ingest_data))
.route(
"/ingest",
post(ingest_data).layer(DefaultBodyLimit::max(
app_state.config.ingest_max_body_bytes,
)),
)
.route("/categories", get(get_categories))
.layer(DefaultBodyLimit::max(1024 * 1024 * 1024))
.route_layer(from_fn_with_state(app_state.clone(), api_auth));
public.merge(protected)

View File

@@ -6,6 +6,7 @@ use common::{
file_info::FileInfo, ingestion_payload::IngestionPayload, ingestion_task::IngestionTask,
user::User,
},
utils::ingest_limits::{validate_ingest_input, IngestValidationError},
};
use futures::{future::try_join_all, TryFutureExt};
use serde_json::json;
@@ -19,7 +20,7 @@ pub struct IngestParams {
pub content: Option<String>,
pub context: String,
pub category: String,
#[form_data(limit = "10000000")] // Adjust limit as needed
#[form_data(limit = "20000000")]
#[form_data(default)]
pub files: Vec<FieldData<NamedTempFile>>,
}
@@ -36,6 +37,22 @@ pub async fn ingest_data(
let category_bytes = input.category.len();
let file_count = input.files.len();
match validate_ingest_input(
&state.config,
input.content.as_deref(),
&input.context,
&input.category,
file_count,
) {
Ok(()) => {}
Err(IngestValidationError::PayloadTooLarge(message)) => {
return Err(ApiError::PayloadTooLarge(message));
}
Err(IngestValidationError::BadRequest(message)) => {
return Err(ApiError::ValidationError(message));
}
}
info!(
user_id = %user_id,
has_content,
@@ -43,7 +60,7 @@ pub async fn ingest_data(
context_bytes,
category_bytes,
file_count,
"Received ingestion request"
"Received ingest request"
);
let file_infos = try_join_all(input.files.into_iter().map(|file| {

View File

@@ -1,4 +1,4 @@
pub mod categories;
pub mod ingress;
pub mod ingest;
pub mod liveness;
pub mod readiness;

View File

@@ -86,6 +86,16 @@ pub struct AppConfig {
pub retrieval_strategy: Option<String>,
#[serde(default)]
pub embedding_backend: EmbeddingBackend,
#[serde(default = "default_ingest_max_body_bytes")]
pub ingest_max_body_bytes: usize,
#[serde(default = "default_ingest_max_files")]
pub ingest_max_files: usize,
#[serde(default = "default_ingest_max_content_bytes")]
pub ingest_max_content_bytes: usize,
#[serde(default = "default_ingest_max_context_bytes")]
pub ingest_max_context_bytes: usize,
#[serde(default = "default_ingest_max_category_bytes")]
pub ingest_max_category_bytes: usize,
}
/// Default data directory for persisted assets.
@@ -103,6 +113,26 @@ fn default_reranking_enabled() -> bool {
false
}
/// Fallback for `ingest_max_body_bytes` when the setting is not supplied:
/// 20 decimal megabytes, expressed in bytes.
fn default_ingest_max_body_bytes() -> usize {
    const TWENTY_MEGABYTES: usize = 20 * 1_000_000;
    TWENTY_MEGABYTES
}
/// Fallback for `ingest_max_files` when the setting is not supplied:
/// at most five files per ingest request.
fn default_ingest_max_files() -> usize {
    const MAX_FILES: usize = 5;
    MAX_FILES
}
/// Fallback for `ingest_max_content_bytes` when the setting is not supplied:
/// 256 KiB, expressed in bytes.
fn default_ingest_max_content_bytes() -> usize {
    const KIB: usize = 1024;
    256 * KIB
}
/// Fallback for `ingest_max_context_bytes` when the setting is not supplied:
/// 16 KiB, expressed in bytes.
fn default_ingest_max_context_bytes() -> usize {
    const KIB: usize = 1024;
    16 * KIB
}
/// Fallback for `ingest_max_category_bytes` when the setting is not supplied:
/// 128 bytes.
fn default_ingest_max_category_bytes() -> usize {
    const MAX_CATEGORY_BYTES: usize = 128;
    MAX_CATEGORY_BYTES
}
pub fn ensure_ort_path() {
if env::var_os("ORT_DYLIB_PATH").is_some() {
return;
@@ -157,6 +187,11 @@ impl Default for AppConfig {
fastembed_max_length: None,
retrieval_strategy: None,
embedding_backend: EmbeddingBackend::default(),
ingest_max_body_bytes: default_ingest_max_body_bytes(),
ingest_max_files: default_ingest_max_files(),
ingest_max_content_bytes: default_ingest_max_content_bytes(),
ingest_max_context_bytes: default_ingest_max_context_bytes(),
ingest_max_category_bytes: default_ingest_max_category_bytes(),
}
}
}

View File

@@ -0,0 +1,113 @@
use super::config::AppConfig;
/// Reasons [`validate_ingest_input`] can reject an ingest request.
///
/// Each variant carries a human-readable message describing the violated limit.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum IngestValidationError {
/// A text field (`content`, `context`, or `category`) is longer, in bytes,
/// than its configured maximum.
PayloadTooLarge(String),
/// The request is malformed in a way unrelated to field size; currently
/// produced when more files are attached than the configured maximum.
BadRequest(String),
}
/// Checks an ingest request against the limits configured in `config`.
///
/// The file count is checked first, then the byte lengths of `content`
/// (when present), `context`, and `category`, in that order. The first
/// violated limit determines the returned error.
///
/// # Errors
///
/// * [`IngestValidationError::BadRequest`] when `file_count` exceeds
///   `config.ingest_max_files`.
/// * [`IngestValidationError::PayloadTooLarge`] when any text field is longer
///   (in bytes) than its configured maximum.
pub fn validate_ingest_input(
    config: &AppConfig,
    content: Option<&str>,
    context: &str,
    category: &str,
    file_count: usize,
) -> Result<(), IngestValidationError> {
    let max_files = config.ingest_max_files;
    if file_count > max_files {
        return Err(IngestValidationError::BadRequest(format!(
            "Too many files. Maximum allowed is {}",
            max_files
        )));
    }

    // (field label, actual size in bytes, configured limit), listed in check order.
    // Absent content counts as zero bytes, which can never exceed a limit.
    let byte_limits = [
        ("Content", content.map_or(0, str::len), config.ingest_max_content_bytes),
        ("Context", context.len(), config.ingest_max_context_bytes),
        ("Category", category.len(), config.ingest_max_category_bytes),
    ];

    for (label, actual, limit) in byte_limits.iter().copied() {
        if actual > limit {
            return Err(IngestValidationError::PayloadTooLarge(format!(
                "{} is too large. Maximum allowed is {} bytes",
                label, limit
            )));
        }
    }

    Ok(())
}
/// Unit tests for [`validate_ingest_input`]: one rejection case per limit,
/// the happy path, and boundary cases pinning that limits are inclusive and
/// measured in bytes.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn validate_ingest_input_rejects_too_many_files() {
        let config = AppConfig {
            ingest_max_files: 1,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, Some("ok"), "ctx", "cat", 2);
        assert!(matches!(result, Err(IngestValidationError::BadRequest(_))));
    }

    #[test]
    fn validate_ingest_input_rejects_oversized_content() {
        let config = AppConfig {
            ingest_max_content_bytes: 4,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, Some("12345"), "ctx", "cat", 0);
        assert!(matches!(
            result,
            Err(IngestValidationError::PayloadTooLarge(_))
        ));
    }

    #[test]
    fn validate_ingest_input_rejects_oversized_context() {
        let config = AppConfig {
            ingest_max_context_bytes: 2,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, None, "long", "cat", 0);
        assert!(matches!(
            result,
            Err(IngestValidationError::PayloadTooLarge(_))
        ));
    }

    #[test]
    fn validate_ingest_input_rejects_oversized_category() {
        let config = AppConfig {
            ingest_max_category_bytes: 2,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, None, "ok", "long", 0);
        assert!(matches!(
            result,
            Err(IngestValidationError::PayloadTooLarge(_))
        ));
    }

    #[test]
    fn validate_ingest_input_accepts_valid_payload() {
        let config = AppConfig::default();
        let result = validate_ingest_input(&config, Some("ok"), "ctx", "cat", 1);
        assert!(result.is_ok());
    }

    // Limits are inclusive: a value exactly at the configured maximum passes.
    #[test]
    fn validate_ingest_input_accepts_values_exactly_at_limits() {
        let config = AppConfig {
            ingest_max_files: 1,
            ingest_max_content_bytes: 4,
            ingest_max_context_bytes: 3,
            ingest_max_category_bytes: 3,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, Some("1234"), "ctx", "cat", 1);
        assert!(result.is_ok());
    }

    // Limits are measured in bytes, not characters: "é" is one char but two bytes.
    #[test]
    fn validate_ingest_input_measures_fields_in_bytes() {
        let config = AppConfig {
            ingest_max_category_bytes: 1,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, None, "ok", "é", 0);
        assert!(matches!(
            result,
            Err(IngestValidationError::PayloadTooLarge(_))
        ));
    }
}

View File

@@ -1,3 +1,4 @@
pub mod config;
pub mod embedding;
pub mod ingest_limits;
pub mod template_engine;

View File

@@ -29,6 +29,11 @@ Minne can be configured via environment variables or a `config.yaml` file. Envir
| `FASTEMBED_CACHE_DIR` | Model cache directory | `<data_dir>/fastembed` |
| `FASTEMBED_SHOW_DOWNLOAD_PROGRESS` | Show progress bar for model downloads | `false` |
| `FASTEMBED_MAX_LENGTH` | Max sequence length for FastEmbed models | - |
| `INGEST_MAX_BODY_BYTES` | Max request body size for ingest endpoints | `20000000` |
| `INGEST_MAX_FILES` | Max files allowed per ingest request | `5` |
| `INGEST_MAX_CONTENT_BYTES` | Max `content` field size for ingest requests | `262144` |
| `INGEST_MAX_CONTEXT_BYTES` | Max `context` field size for ingest requests | `16384` |
| `INGEST_MAX_CATEGORY_BYTES` | Max `category` field size for ingest requests | `128` |
### S3 Storage (Optional)
@@ -76,6 +81,13 @@ embedding_backend: "fastembed"
# Optional reranking
reranking_enabled: true
reranking_pool_size: 2
# Ingest safety limits
ingest_max_body_bytes: 20000000
ingest_max_files: 5
ingest_max_content_bytes: 262144
ingest_max_context_bytes: 16384
ingest_max_category_bytes: 128
```
## AI Provider Setup

View File

@@ -35,7 +35,9 @@ where
.add_protected_routes(routes::chat::router())
.add_protected_routes(routes::content::router())
.add_protected_routes(routes::knowledge::router())
.add_protected_routes(routes::ingestion::router())
.add_protected_routes(routes::ingestion::router(
app_state.config.ingest_max_body_bytes,
))
.add_protected_routes(routes::scratchpad::router())
.with_compression()
.build()

View File

@@ -2,9 +2,10 @@ use std::{pin::Pin, time::Duration};
use axum::{
extract::{Query, State},
http::StatusCode,
response::{
sse::{Event, KeepAlive},
Html, IntoResponse, Sse,
Html, IntoResponse, Response, Sse,
},
};
use axum_typed_multipart::{FieldData, TryFromMultipart, TypedMultipart};
@@ -23,6 +24,7 @@ use common::{
ingestion_task::{IngestionTask, TaskState},
user::User,
},
utils::ingest_limits::{validate_ingest_input, IngestValidationError},
};
use crate::{
@@ -34,30 +36,32 @@ use crate::{
AuthSessionType,
};
pub async fn show_ingress_form(
pub async fn show_ingest_form(
State(state): State<HtmlState>,
RequireUser(user): RequireUser,
) -> Result<impl IntoResponse, HtmlError> {
let user_categories = User::get_user_categories(&user.id, &state.db).await?;
#[derive(Serialize)]
pub struct ShowIngressFormData {
pub struct ShowIngestFormData {
user_categories: Vec<String>,
}
Ok(TemplateResponse::new_template(
"ingestion_modal.html",
ShowIngressFormData { user_categories },
ShowIngestFormData { user_categories },
))
}
pub async fn hide_ingress_form(
pub async fn hide_ingest_form(
RequireUser(_user): RequireUser,
) -> Result<impl IntoResponse, HtmlError> {
Ok(Html(
"<a class='btn btn-primary' hx-get='/ingress-form' hx-swap='outerHTML'>Add Content</a>",
Ok(
Html(
"<a class='btn btn-primary' hx-get='/ingest-form' hx-swap='outerHTML'>Add Content</a>",
)
.into_response(),
)
.into_response())
}
#[derive(Debug, TryFromMultipart)]
@@ -65,34 +69,22 @@ pub struct IngestionParams {
pub content: Option<String>,
pub context: String,
pub category: String,
#[form_data(limit = "10000000")] // Adjust limit as needed
#[form_data(limit = "20000000")]
#[form_data(default)]
pub files: Vec<FieldData<NamedTempFile>>,
}
pub async fn process_ingress_form(
pub async fn process_ingest_form(
State(state): State<HtmlState>,
RequireUser(user): RequireUser,
TypedMultipart(input): TypedMultipart<IngestionParams>,
) -> Result<impl IntoResponse, HtmlError> {
#[derive(Serialize)]
pub struct IngressFormData {
context: String,
content: String,
category: String,
error: String,
}
) -> Result<Response, HtmlError> {
if input.content.as_ref().is_none_or(|c| c.len() < 2) && input.files.is_empty() {
return Ok(TemplateResponse::new_template(
"index/signed_in/ingress_form.html",
IngressFormData {
context: input.context.clone(),
content: input.content.clone().unwrap_or_default(),
category: input.category.clone(),
error: "You need to either add files or content".to_string(),
},
));
return Ok((
StatusCode::BAD_REQUEST,
"You need to either add files or content",
)
.into_response());
}
let content_bytes = input.content.as_ref().map_or(0, |c| c.len());
@@ -101,6 +93,22 @@ pub async fn process_ingress_form(
let category_bytes = input.category.len();
let file_count = input.files.len();
match validate_ingest_input(
&state.config,
input.content.as_deref(),
&input.context,
&input.category,
file_count,
) {
Ok(()) => {}
Err(IngestValidationError::PayloadTooLarge(message)) => {
return Ok((StatusCode::PAYLOAD_TOO_LARGE, message).into_response());
}
Err(IngestValidationError::BadRequest(message)) => {
return Ok((StatusCode::BAD_REQUEST, message).into_response());
}
}
info!(
user_id = %user.id,
has_content,
@@ -108,7 +116,7 @@ pub async fn process_ingress_form(
context_bytes,
category_bytes,
file_count,
"Received ingestion form submission"
"Received ingest form submission"
);
let file_infos = try_join_all(input.files.into_iter().map(|file| {
@@ -137,10 +145,10 @@ pub async fn process_ingress_form(
tasks: Vec<IngestionTask>,
}
Ok(TemplateResponse::new_template(
"dashboard/current_task.html",
NewTasksData { tasks },
))
Ok(
TemplateResponse::new_template("dashboard/current_task.html", NewTasksData { tasks })
.into_response(),
)
}
#[derive(Deserialize)]

View File

@@ -1,22 +1,22 @@
mod handlers;
use axum::{extract::FromRef, routing::get, Router};
use handlers::{
get_task_updates_stream, hide_ingress_form, process_ingress_form, show_ingress_form,
};
use axum::{extract::DefaultBodyLimit, extract::FromRef, routing::get, Router};
use handlers::{get_task_updates_stream, hide_ingest_form, process_ingest_form, show_ingest_form};
use crate::html_state::HtmlState;
pub fn router<S>() -> Router<S>
pub fn router<S>(max_body_bytes: usize) -> Router<S>
where
S: Clone + Send + Sync + 'static,
HtmlState: FromRef<S>,
{
Router::new()
.route(
"/ingress-form",
get(show_ingress_form).post(process_ingress_form),
"/ingest-form",
get(show_ingest_form)
.post(process_ingest_form)
.layer(DefaultBodyLimit::max(max_body_bytes)),
)
.route("/task/status-stream", get(get_task_updates_stream))
.route("/hide-ingress-form", get(hide_ingress_form))
.route("/hide-ingest-form", get(hide_ingest_form))
}

View File

@@ -2,7 +2,7 @@
{% block dashboard_header %}
<h1 class="text-xl font-extrabold tracking-tight">Dashboard</h1>
<button class="nb-btn nb-cta" hx-get="/ingress-form" hx-target="#modal" hx-swap="innerHTML">
<button class="nb-btn nb-cta" hx-get="/ingest-form" hx-target="#modal" hx-swap="innerHTML">
{% include "icons/send_icon.html" %}
<span class="ml-2">Add Content</span>
</button>

View File

@@ -3,7 +3,7 @@
{% block modal_class %}max-w-3xl{% endblock %}
{% block form_attributes %}
hx-post="/ingress-form"
hx-post="/ingest-form"
enctype="multipart/form-data"
{% endblock %}

View File

@@ -18,7 +18,7 @@
</li>
{% endfor %}
<li>
<button class="nb-btn nb-cta w-full flex items-center gap-3 justify-start mt-2" hx-get="/ingress-form"
<button class="nb-btn nb-cta w-full flex items-center gap-3 justify-start mt-2" hx-get="/ingest-form"
hx-target="#modal" hx-swap="innerHTML">{% include "icons/send_icon.html" %} Add
Content</button>
</li>