From f2fa5bbbcc028502038cf7cf22e054ad84408ea7 Mon Sep 17 00:00:00 2001 From: Per Stark Date: Sat, 17 Jan 2026 23:31:16 +0100 Subject: [PATCH] fix: edge case when deleting content nit --- common/src/storage/types/knowledge_entity.rs | 59 +++++++++++++++-- common/src/storage/types/text_chunk.rs | 13 ++-- .../src/storage/types/text_chunk_embedding.rs | 37 ++--------- html-router/src/router_factory.rs | 63 ++++++++++--------- 4 files changed, 102 insertions(+), 70 deletions(-) diff --git a/common/src/storage/types/knowledge_entity.rs b/common/src/storage/types/knowledge_entity.rs index be47313..9acc36d 100644 --- a/common/src/storage/types/knowledge_entity.rs +++ b/common/src/storage/types/knowledge_entity.rs @@ -171,6 +171,9 @@ impl KnowledgeEntity { source_id: &str, db_client: &SurrealDbClient, ) -> Result<(), AppError> { + // Delete embeddings first, while we can still look them up via the entity's source_id + KnowledgeEntityEmbedding::delete_by_source_id(source_id, db_client).await?; + let query = format!( "DELETE {} WHERE source_id = '{}'", Self::table_name(), @@ -224,7 +227,7 @@ impl KnowledgeEntity { ) -> Result, AppError> { #[derive(Deserialize)] struct Row { - entity_id: KnowledgeEntity, + entity_id: Option, score: f32, } @@ -257,9 +260,11 @@ impl KnowledgeEntity { Ok(rows .into_iter() - .map(|r| KnowledgeEntityVectorResult { - entity: r.entity_id, - score: r.score, + .filter_map(|r| { + r.entity_id.map(|entity| KnowledgeEntityVectorResult { + entity, + score: r.score, + }) }) .collect()) } @@ -914,4 +919,50 @@ mod tests { assert_eq!(results[0].entity.id, e2.id); assert_eq!(results[1].entity.id, e1.id); } + + #[tokio::test] + async fn test_vector_search_with_orphaned_embedding() { + let namespace = "test_ns_orphan"; + let database = &Uuid::new_v4().to_string(); + let db = SurrealDbClient::memory(namespace, database) + .await + .expect("Failed to start in-memory surrealdb"); + db.apply_migrations() + .await + .expect("Failed to apply migrations"); + + KnowledgeEntityEmbedding::redefine_hnsw_index(&db, 3) + .await + .expect("Failed to redefine index length"); + + let user_id = "user".to_string(); + let source_id = "src".to_string(); + let entity = KnowledgeEntity::new( + source_id.clone(), + "orphan".to_string(), + "orphan desc".to_string(), + KnowledgeEntityType::Document, + None, + user_id.clone(), + ); + + KnowledgeEntity::store_with_embedding(entity.clone(), vec![0.1, 0.2, 0.3], &db) + .await + .expect("store entity with embedding"); + + // Manually delete the entity to create an orphan + let query = format!("DELETE type::thing('knowledge_entity', '{}')", entity.id); + db.client.query(query).await.expect("delete entity"); + + // Now search + let results = KnowledgeEntity::vector_search(3, vec![0.1, 0.2, 0.3], &db, &user_id) + .await + .expect("search should succeed even with orphans"); + + assert!( + results.is_empty(), + "Should return empty result for orphan, got: {:?}", + results + ); + } } diff --git a/common/src/storage/types/text_chunk.rs b/common/src/storage/types/text_chunk.rs index 1f876c0..e2d7ee3 100644 --- a/common/src/storage/types/text_chunk.rs +++ b/common/src/storage/types/text_chunk.rs @@ -44,6 +44,9 @@ impl TextChunk { source_id: &str, db_client: &SurrealDbClient, ) -> Result<(), AppError> { + // Delete embeddings first + TextChunkEmbedding::delete_by_source_id(source_id, db_client).await?; + let query = format!( "DELETE {} WHERE source_id = '{}'", Self::table_name(), @@ -102,7 +105,7 @@ impl TextChunk { #[allow(clippy::missing_docs_in_private_items)] #[derive(Deserialize)] struct Row { - chunk_id: TextChunk, + chunk_id: Option, score: f32, } @@ -134,9 +137,11 @@ impl TextChunk { Ok(rows .into_iter() - .map(|r| TextChunkSearchResult { - chunk: r.chunk_id, - score: r.score, + .filter_map(|r| { + r.chunk_id.map(|chunk| TextChunkSearchResult { + chunk, + score: r.score, + }) }) .collect()) } diff --git a/common/src/storage/types/text_chunk_embedding.rs b/common/src/storage/types/text_chunk_embedding.rs index 734eadd..90a3da6 100644 --- a/common/src/storage/types/text_chunk_embedding.rs +++ b/common/src/storage/types/text_chunk_embedding.rs @@ -102,44 +102,19 @@ impl TextChunkEmbedding { /// Delete all embeddings that belong to chunks with a given `source_id` /// - /// This uses a subquery to the `text_chunk` table: - /// - /// DELETE FROM text_chunk_embedding - /// WHERE chunk_id IN (SELECT id FROM text_chunk WHERE source_id = $source_id) + /// This uses the denormalized `source_id` on the embedding table. pub async fn delete_by_source_id( source_id: &str, db: &SurrealDbClient, ) -> Result<(), AppError> { - #[allow(clippy::missing_docs_in_private_items)] - #[derive(Deserialize)] - struct IdRow { - id: RecordId, - } - let ids_query = format!( - "SELECT id FROM {} WHERE source_id = $source_id", - TextChunk::table_name() - ); - let mut res = db - .client - .query(ids_query) - .bind(("source_id", source_id.to_owned())) - .await - .map_err(AppError::Database)?; - let ids: Vec = res.take(0).map_err(AppError::Database)?; - - if ids.is_empty() { - return Ok(()); - } - let delete_query = format!( - "DELETE FROM {} WHERE chunk_id IN $chunk_ids", + let query = format!( + "DELETE FROM {} WHERE source_id = $source_id", Self::table_name() ); + db.client - .query(delete_query) - .bind(( - "chunk_ids", - ids.into_iter().map(|row| row.id).collect::>(), - )) + .query(query) + .bind(("source_id", source_id.to_owned())) .await .map_err(AppError::Database)? .check() diff --git a/html-router/src/router_factory.rs b/html-router/src/router_factory.rs index 08c932d..eccb011 100644 --- a/html-router/src/router_factory.rs +++ b/html-router/src/router_factory.rs @@ -120,79 +120,80 @@ where } pub fn build(self) -> Router { - // Start with an empty router - let mut public_router = Router::new(); + // Build the "App" router (Pages, API interactions, etc.) + let mut app_router = Router::new(); - // Merge all public routers + // Merge all public routers (pages) for router in self.public_routers { - public_router = public_router.merge(router); + app_router = app_router.merge(router); } // Add nested public routes for (path, router) in self.nested_routes { - public_router = public_router.nest(&path, router); + app_router = app_router.nest(&path, router); } - // Add public assets to public router - if let Some(assets_config) = self.public_assets_config { - // Call the macro using the stored relative directory path - let asset_service = create_asset_service!(&assets_config.directory); - // Nest the resulting service under the stored URL path - public_router = public_router.nest_service(&assets_config.path, asset_service); - } - - // Start with an empty protected router + // Build protected router logic... let mut protected_router = Router::new(); - - // Check if there are any protected routers let has_protected_routes = !self.protected_routers.is_empty() || !self.nested_protected_routes.is_empty(); - // Merge root-level protected routers for router in self.protected_routers { protected_router = protected_router.merge(router); } - // Nest protected routers for (path, router) in self.nested_protected_routes { protected_router = protected_router.nest(&path, router); } - // Apply auth middleware if has_protected_routes { protected_router = protected_router .route_layer(from_fn_with_state(self.app_state.clone(), require_auth)); } - // Combine public and protected routes - let mut router = Router::new().merge(public_router).merge(protected_router); + // Combine public and protected routes into the App router + app_router = app_router.merge(protected_router); - // Apply custom middleware in order they were added + // Apply custom middleware to the App router for middleware_fn in self.custom_middleware { - router = middleware_fn(router); + app_router = middleware_fn(app_router); } - // Apply common middleware - router = router.layer(from_fn_with_state( + // Apply App-specific Middleware (Analytics, Template, Auth, Session) + app_router = app_router.layer(from_fn_with_state( self.app_state.clone(), analytics_middleware::, )); - router = router.layer(from_fn_with_state( + app_router = app_router.layer(from_fn_with_state( self.app_state.clone(), with_template_response::, )); - router = router.layer( + app_router = app_router.layer( AuthSessionLayer::, Surreal>::new(Some( self.app_state.db.client.clone(), )) .with_config(AuthConfig::::default()), ); - router = router.layer(SessionLayer::new((*self.app_state.session_store).clone())); + app_router = app_router.layer(SessionLayer::new((*self.app_state.session_store).clone())); - if self.compression_enabled { - router = router.layer(compression_layer()); + // Build the Final router, starting with assets (bypassing app middleware) + let mut final_router = Router::new(); + + if let Some(assets_config) = self.public_assets_config { + // Call the macro using the stored relative directory path + let asset_service = create_asset_service!(&assets_config.directory); + // Nest the resulting service under the stored URL path + final_router = final_router.nest_service(&assets_config.path, asset_service); } - router + // Merge the App router + final_router = final_router.merge(app_router); + + // Apply Global Middleware (Compression) + if self.compression_enabled { + final_router = final_router.layer(compression_layer()); + } + + final_router } }