fix: removed stale embeddings handler

This commit is contained in:
Per Stark
2025-11-29 20:07:48 +01:00
parent 1039ec32a4
commit 2939e4c2a4
6 changed files with 3 additions and 226 deletions

View File

@@ -439,7 +439,6 @@ impl Config {
pub struct ParsedArgs {
pub config: Config,
pub show_help: bool,
}
pub fn parse() -> Result<ParsedArgs> {
@@ -447,7 +446,6 @@ pub fn parse() -> Result<ParsedArgs> {
config.finalize()?;
Ok(ParsedArgs {
config,
show_help: false, // Clap handles help automatically
})
}

View File

@@ -1,6 +1,5 @@
use anyhow::{Context, Result};
use common::storage::{db::SurrealDbClient, indexes::ensure_runtime_indexes};
use serde::Deserialize;
use tracing::info;
// Remove and recreate HNSW indexes for changing embedding lengths, used at beginning if embedding length differs from default system settings.
@@ -50,6 +49,7 @@ pub async fn reset_namespace(db: &SurrealDbClient, namespace: &str, database: &s
#[cfg(test)]
mod tests {
use super::*;
use serde::Deserialize;
use uuid::Uuid;
#[derive(Debug, Deserialize)]

View File

@@ -1,171 +0,0 @@
use std::{
collections::hash_map::DefaultHasher,
hash::{Hash, Hasher},
str::FromStr,
sync::Arc,
};
use anyhow::{anyhow, Context, Result};
use fastembed::{EmbeddingModel, ModelTrait, TextEmbedding, TextInitOptions};
use tokio::sync::Mutex;
use crate::args::{Config, EmbeddingBackend};
#[derive(Clone)]
pub struct EmbeddingProvider {
inner: EmbeddingInner,
}
#[derive(Clone)]
enum EmbeddingInner {
Hashed {
dimension: usize,
},
FastEmbed {
model: Arc<Mutex<TextEmbedding>>,
model_name: EmbeddingModel,
dimension: usize,
},
}
impl EmbeddingProvider {
pub fn backend_label(&self) -> &'static str {
match self.inner {
EmbeddingInner::Hashed { .. } => "hashed",
EmbeddingInner::FastEmbed { .. } => "fastembed",
}
}
pub fn dimension(&self) -> usize {
match &self.inner {
EmbeddingInner::Hashed { dimension } => *dimension,
EmbeddingInner::FastEmbed { dimension, .. } => *dimension,
}
}
pub fn model_code(&self) -> Option<String> {
match &self.inner {
EmbeddingInner::FastEmbed { model_name, .. } => Some(model_name.to_string()),
EmbeddingInner::Hashed { .. } => None,
}
}
pub async fn embed(&self, text: &str) -> Result<Vec<f32>> {
match &self.inner {
EmbeddingInner::Hashed { dimension } => Ok(hashed_embedding(text, *dimension)),
EmbeddingInner::FastEmbed { model, .. } => {
let mut guard = model.lock().await;
let embeddings = guard
.embed(vec![text.to_owned()], None)
.context("generating fastembed vector")?;
embeddings
.into_iter()
.next()
.ok_or_else(|| anyhow!("fastembed returned no embedding for input"))
}
}
}
pub async fn embed_batch(&self, texts: Vec<String>) -> Result<Vec<Vec<f32>>> {
match &self.inner {
EmbeddingInner::Hashed { dimension } => Ok(texts
.into_iter()
.map(|text| hashed_embedding(&text, *dimension))
.collect()),
EmbeddingInner::FastEmbed { model, .. } => {
if texts.is_empty() {
return Ok(Vec::new());
}
let mut guard = model.lock().await;
guard
.embed(texts, None)
.context("generating fastembed batch embeddings")
}
}
}
}
pub async fn build_provider(
config: &Config,
default_dimension: usize,
) -> Result<EmbeddingProvider> {
match config.embedding_backend {
EmbeddingBackend::Hashed => Ok(EmbeddingProvider {
inner: EmbeddingInner::Hashed {
dimension: default_dimension.max(1),
},
}),
EmbeddingBackend::FastEmbed => {
let model_name = if let Some(code) = config.embedding_model.as_deref() {
EmbeddingModel::from_str(code).map_err(|err| anyhow!(err))?
} else {
EmbeddingModel::default()
};
let options =
TextInitOptions::new(model_name.clone()).with_show_download_progress(true);
let model_name_for_task = model_name.clone();
let model_name_code = model_name.to_string();
let (model, dimension) = tokio::task::spawn_blocking(move || -> Result<_> {
let model =
TextEmbedding::try_new(options).context("initialising FastEmbed text model")?;
let info =
EmbeddingModel::get_model_info(&model_name_for_task).ok_or_else(|| {
anyhow!("FastEmbed model metadata missing for {model_name_code}")
})?;
Ok((model, info.dim))
})
.await
.context("joining FastEmbed initialisation task")??;
Ok(EmbeddingProvider {
inner: EmbeddingInner::FastEmbed {
model: Arc::new(Mutex::new(model)),
model_name,
dimension,
},
})
}
}
}
fn hashed_embedding(text: &str, dimension: usize) -> Vec<f32> {
let dim = dimension.max(1);
let mut vector = vec![0.0f32; dim];
if text.is_empty() {
return vector;
}
let mut token_count = 0f32;
for token in tokens(text) {
token_count += 1.0;
let idx = bucket(&token, dim);
vector[idx] += 1.0;
}
if token_count == 0.0 {
return vector;
}
let norm = vector.iter().map(|v| v * v).sum::<f32>().sqrt();
if norm > 0.0 {
for value in &mut vector {
*value /= norm;
}
}
vector
}
fn tokens(text: &str) -> impl Iterator<Item = String> + '_ {
text.split(|c: char| !c.is_ascii_alphanumeric())
.filter(|token| !token.is_empty())
.map(|token| token.to_ascii_lowercase())
}
fn bucket(token: &str, dimension: usize) -> usize {
let mut hasher = DefaultHasher::new();
token.hash(&mut hasher);
(hasher.finish() as usize) % dimension
}

View File

@@ -1,9 +1,6 @@
use std::path::PathBuf;
use anyhow::Result;
use async_trait::async_trait;
use crate::{args::Config, embedding::EmbeddingProvider};
use crate::args::Config;
#[derive(Debug, Clone)]
pub struct CorpusCacheConfig {
@@ -32,52 +29,6 @@ impl CorpusCacheConfig {
}
}
#[async_trait]
pub trait CorpusEmbeddingProvider: Send + Sync {
fn backend_label(&self) -> &str;
fn model_code(&self) -> Option<String>;
fn dimension(&self) -> usize;
async fn embed_batch(&self, texts: Vec<String>) -> Result<Vec<Vec<f32>>>;
}
#[async_trait]
impl CorpusEmbeddingProvider for EmbeddingProvider {
fn backend_label(&self) -> &str {
EmbeddingProvider::backend_label(self)
}
fn model_code(&self) -> Option<String> {
EmbeddingProvider::model_code(self)
}
fn dimension(&self) -> usize {
EmbeddingProvider::dimension(self)
}
async fn embed_batch(&self, texts: Vec<String>) -> Result<Vec<Vec<f32>>> {
EmbeddingProvider::embed_batch(self, texts).await
}
}
#[async_trait]
impl CorpusEmbeddingProvider for common::utils::embedding::EmbeddingProvider {
fn backend_label(&self) -> &str {
common::utils::embedding::EmbeddingProvider::backend_label(self)
}
fn model_code(&self) -> Option<String> {
common::utils::embedding::EmbeddingProvider::model_code(self)
}
fn dimension(&self) -> usize {
common::utils::embedding::EmbeddingProvider::dimension(self)
}
async fn embed_batch(&self, texts: Vec<String>) -> Result<Vec<Vec<f32>>> {
common::utils::embedding::EmbeddingProvider::embed_batch(self, texts).await
}
}
impl From<&Config> for CorpusCacheConfig {
fn from(config: &Config) -> Self {
CorpusCacheConfig::new(

View File

@@ -2,7 +2,6 @@ mod args;
mod cache;
mod datasets;
mod db_helpers;
mod embedding;
mod eval;
mod ingest;
mod inspection;