Mirror of https://github.com/perstarkse/minne.git, synced 2026-02-20 23:27:39 +01:00.

Compare commits (115 commits):
- 4d237ff6d9
- eb928cdb0e
- 1490852a09
- b0b01182d7
- 679308aa1d
- f93c06b347
- a3f207beb1
- e07199adfc
- f22cac891c
- b89171d934
- 0133eead63
- e5d2b6605f
- bbad91d55b
- 96846ad664
- 269bcec659
- 7c738c4b30
- cb88127fcb
- 49e1fbd985
- f2fa5bbbcc
- a3bc6fba98
- ece744d5a0
- a9fda67209
- fa7f407306
- b25cfb4633
- 0df2b9810c
- 354dc727c1
- 037057d108
- 9f17c6c2b0
- 17f252e630
- db43be1606
- 8e8370b080
- 84695fa0cc
- 654add98bc
- 244ec0ea25
- d8416ac711
- f9f48d1046
- 30b8a65377
- 04faa38ee6
- cdc62dda30
- ab8ff8b07a
- 79ea007b0a
- a5bc72aedf
- 2e2ea0c4ff
- a090a8c76e
- a8d10f265c
- 0cb1abc6db
- d1a6d9abdf
- d3fa3be3e5
- a2c9bb848d
- dd881efbf9
- 2939e4c2a4
- 1039ec32a4
- cb906c5b53
- 08b1612fcb
- 67004c9646
- 030f0fc17d
- 226b2db43a
- 6f88d87e74
- bd519ab269
- f535df7e61
- 6b7befbd04
- 0eda65b07e
- 04ee225732
- 13b7ad6f3a
- 112a6965a4
- 911e830be5
- 3196e65172
- 380c900c86
- a99e5ada8b
- b0deabaf3f
- a8f0d9fa88
- 56a1dfddb8
- 863b921fb4
- f13791cfcf
- 75c200b2ba
- 1b7c24747a
- 241ad9a089
- 72578296db
- a0e9387c76
- 798b1468b6
- 3b805778b4
- 07b3e1a0e8
- 83d39afad4
- 21e4ab1f42
- 3c97d8ead5
- ab68bccb80
- 99b88c3063
- 44e5d8a2fc
- 7332347f1a
- 199186e5a3
- 64728468cd
- c3a7e8dc59
- 35ff4e1464
- 2964f1a5a5
- cb7f625b81
- dc40cf7663
- aa0b1462a1
- 41fc7bb99c
- 61d8d7abe7
- b7344644dc
- 3742598a6d
- c6a6080e1c
- 1159712724
- e5e1414f54
- fcc49b1954
- 022f4d8575
- 945a2b7f37
- ff4ea55cd5
- c4c76efe92
- c0fcad5952
- b0ed69330d
- 5cb15dab45
- 7403195df5
- 9faef31387
- 110f7b8a8f
**.cargo/config.toml** (new file, 2 lines)

@@ -0,0 +1,2 @@

```toml
[alias]
eval = "run -p evaluations --"
```
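The alias above makes the `evaluations` crate reachable straight from Cargo. A minimal usage sketch (the `--help` flag is only an assumption about what the evaluations binary accepts; the alias expansion itself is standard Cargo behaviour):

```bash
# `cargo eval ...` expands to `cargo run -p evaluations -- ...`,
# so everything after `eval` is forwarded to the evaluations binary.
cargo eval --help

# Equivalent invocation without the alias:
cargo run -p evaluations -- --help
```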
**.github/build-setup.yml** (vendored, new file, 49 lines)

@@ -0,0 +1,49 @@

```yaml
- name: Prepare lib dir
  run: mkdir -p lib

# Linux
- name: Fetch ONNX Runtime (Linux)
  if: runner.os == 'Linux'
  env:
    ORT_VER: 1.22.0
  run: |
    set -euo pipefail
    ARCH="$(uname -m)"
    case "$ARCH" in
      x86_64) URL="https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VER}/onnxruntime-linux-x64-${ORT_VER}.tgz" ;;
      aarch64) URL="https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VER}/onnxruntime-linux-aarch64-${ORT_VER}.tgz" ;;
      *) echo "Unsupported arch $ARCH"; exit 1 ;;
    esac
    curl -fsSL -o ort.tgz "$URL"
    tar -xzf ort.tgz
    cp -v onnxruntime-*/lib/libonnxruntime.so* lib/

# macOS
- name: Fetch ONNX Runtime (macOS)
  if: runner.os == 'macOS'
  env:
    ORT_VER: 1.22.0
  run: |
    set -euo pipefail
    curl -fsSL -o ort.tgz "https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VER}/onnxruntime-osx-universal2-${ORT_VER}.tgz"
    tar -xzf ort.tgz
    # copy the main dylib; rename to stable name if needed
    cp -v onnxruntime-*/lib/libonnxruntime*.dylib lib/
    # optional: ensure a stable name
    if [ ! -f lib/libonnxruntime.dylib ]; then
      cp -v lib/libonnxruntime*.dylib lib/libonnxruntime.dylib
    fi

# Windows
- name: Fetch ONNX Runtime (Windows)
  if: runner.os == 'Windows'
  shell: pwsh
  env:
    ORT_VER: 1.22.0
  run: |
    $url = "https://github.com/microsoft/onnxruntime/releases/download/v$env:ORT_VER/onnxruntime-win-x64-$env:ORT_VER.zip"
    Invoke-WebRequest $url -OutFile ort.zip
    Expand-Archive ort.zip -DestinationPath ort
    $dll = Get-ChildItem -Recurse -Path ort -Filter onnxruntime.dll | Select-Object -First 1
    Copy-Item $dll.FullName lib\onnxruntime.dll
```
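These steps stage the ONNX Runtime shared library in `lib/` so cargo-dist can bundle it next to the released binaries. Since the workspace builds `fastembed` with the `ort-load-dynamic` feature (see the `Cargo.toml` diff further down) and the Dockerfile points the loader at the library via `ORT_DYLIB_PATH`, a rough local equivalent is sketched below; this is an assumption-based sketch, not a step from the workflow itself:

```bash
# Assumes an ONNX Runtime tarball has already been unpacked into ./lib,
# the same way the CI steps above do it.
export ORT_DYLIB_PATH="$PWD/lib/libonnxruntime.so"
cargo run --release --bin main
```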
**.github/workflows/release.yml** (vendored, 242 lines changed)

```diff
@@ -1,44 +1,8 @@
# This file was autogenerated by dist: https://opensource.axo.dev/cargo-dist/
#
# Copyright 2022-2024, axodotdev
# SPDX-License-Identifier: MIT or Apache-2.0
#
# CI that:
#
# * checks for a Git Tag that looks like a release
# * builds artifacts with dist (archives, installers, hashes)
# * uploads those artifacts to temporary workflow zip
# * on success, uploads the artifacts to a GitHub Release
#
# Note that the GitHub Release will be created with a generated
# title/body based on your changelogs.

name: Release
permissions:
  "contents": "write"
  "packages": "write"
  contents: write
  packages: write

# This task will run whenever you push a git tag that looks like a version
# like "1.0.0", "v0.1.0-prerelease.1", "my-app/0.1.0", "releases/v1.0.0", etc.
# Various formats will be parsed into a VERSION and an optional PACKAGE_NAME, where
# PACKAGE_NAME must be the name of a Cargo package in your workspace, and VERSION
# must be a Cargo-style SemVer Version (must have at least major.minor.patch).
#
# If PACKAGE_NAME is specified, then the announcement will be for that
# package (erroring out if it doesn't have the given version or isn't dist-able).
#
# If PACKAGE_NAME isn't specified, then the announcement will be for all
# (dist-able) packages in the workspace with that version (this mode is
# intended for workspaces with only one dist-able package, or with all dist-able
# packages versioned/released in lockstep).
#
# If you push multiple tags at once, separate instances of this workflow will
# spin up, creating an independent announcement for each one. However, GitHub
# will hard limit this to 3 tags per commit, as it will assume more tags is a
# mistake.
#
# If there's a prerelease-style suffix to the version, then the release(s)
# will be marked as a prerelease.
on:
  pull_request:
  push:
@@ -46,9 +10,8 @@ on:
      - '**[0-9]+.[0-9]+.[0-9]+*'

jobs:
  # Run 'dist plan' (or host) to determine what tasks we need to do
  plan:
    runs-on: "ubuntu-22.04"
    runs-on: ubuntu-22.04
    outputs:
      val: ${{ steps.plan.outputs.manifest }}
      tag: ${{ !github.event.pull_request && github.ref_name || '' }}
@@ -60,52 +23,36 @@ jobs:
      - uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Install dist
        # we specify bash to get pipefail; it guards against the `curl` command
        # failing. otherwise `sh` won't catch that `curl` returned non-0
        shell: bash
        run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.28.0/cargo-dist-installer.sh | sh"
        run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.30.0/cargo-dist-installer.sh | sh"

      - name: Cache dist
        uses: actions/upload-artifact@v4
        with:
          name: cargo-dist-cache
          path: ~/.cargo/bin/dist
      # sure would be cool if github gave us proper conditionals...
      # so here's a doubly-nested ternary-via-truthiness to try to provide the best possible
      # functionality based on whether this is a pull_request, and whether it's from a fork.
      # (PRs run on the *source* but secrets are usually on the *target* -- that's *good*
      # but also really annoying to build CI around when it needs secrets to work right.)

      - id: plan
        run: |
          dist ${{ (!github.event.pull_request && format('host --steps=create --tag={0}', github.ref_name)) || 'plan' }} --output-format=json > plan-dist-manifest.json
          echo "dist ran successfully"
          cat plan-dist-manifest.json
          echo "manifest=$(jq -c "." plan-dist-manifest.json)" >> "$GITHUB_OUTPUT"
      - name: "Upload dist-manifest.json"
          echo "manifest=$(jq -c . plan-dist-manifest.json)" >> "$GITHUB_OUTPUT"

      - name: Upload dist-manifest.json
        uses: actions/upload-artifact@v4
        with:
          name: artifacts-plan-dist-manifest
          path: plan-dist-manifest.json

  # Build and packages all the platform-specific things
  build-local-artifacts:
    name: build-local-artifacts (${{ join(matrix.targets, ', ') }})
    # Let the initial task tell us to not run (currently very blunt)
    needs:
      - plan
    needs: [plan]
    if: ${{ fromJson(needs.plan.outputs.val).ci.github.artifacts_matrix.include != null && (needs.plan.outputs.publishing == 'true' || fromJson(needs.plan.outputs.val).ci.github.pr_run_mode == 'upload') }}
    strategy:
      fail-fast: false
      # Target platforms/runners are computed by dist in create-release.
      # Each member of the matrix has the following arguments:
      #
      # - runner: the github runner
      # - dist-args: cli flags to pass to dist
      # - install-dist: expression to run to install dist on the runner
      #
      # Typically there will be:
      # - 1 "global" task that builds universal installers
      # - N "local" tasks that build each platform's binaries and platform-specific installers
      matrix: ${{ fromJson(needs.plan.outputs.val).ci.github.artifacts_matrix }}
    runs-on: ${{ matrix.runner }}
    container: ${{ matrix.container && matrix.container.image || null }}
@@ -114,11 +61,12 @@ jobs:
      BUILD_MANIFEST_NAME: target/distrib/${{ join(matrix.targets, '-') }}-dist-manifest.json
    steps:
      - name: enable windows longpaths
        run: |
          git config --global core.longpaths true
        run: git config --global core.longpaths true

      - uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Install Rust non-interactively if not already installed
        if: ${{ matrix.container }}
        run: |
@@ -126,37 +74,103 @@ jobs:
          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
          echo "$HOME/.cargo/bin" >> $GITHUB_PATH
          fi

      - name: Install dist
        run: ${{ matrix.install_dist.run }}
      # Get the dist-manifest

      - name: Fetch local artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: artifacts-*
          path: target/distrib/
          merge-multiple: true

      # ===== BEGIN: Injected ORT staging for cargo-dist bundling =====
      - run: echo "=== BUILD-SETUP START ==="

      # Unix shells
      - name: Prepare lib dir (Unix)
        if: runner.os != 'Windows'
        shell: bash
        run: |
          mkdir -p lib
          rm -f lib/*

      # Windows PowerShell
      - name: Prepare lib dir (Windows)
        if: runner.os == 'Windows'
        shell: pwsh
        run: |
          New-Item -ItemType Directory -Force -Path lib | Out-Null
          # remove contents if any
          Get-ChildItem -Path lib -Force | Remove-Item -Force -Recurse -ErrorAction SilentlyContinue

      - name: Fetch ONNX Runtime (Linux)
        if: runner.os == 'Linux'
        env:
          ORT_VER: 1.22.0
        run: |
          set -euo pipefail
          ARCH="$(uname -m)"
          case "$ARCH" in
            x86_64) URL="https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VER}/onnxruntime-linux-x64-${ORT_VER}.tgz" ;;
            aarch64) URL="https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VER}/onnxruntime-linux-aarch64-${ORT_VER}.tgz" ;;
            *) echo "Unsupported arch $ARCH"; exit 1 ;;
          esac
          curl -fsSL -o ort.tgz "$URL"
          tar -xzf ort.tgz
          cp -v onnxruntime-*/lib/libonnxruntime.so* lib/
          # normalize to stable name if needed
          [ -f lib/libonnxruntime.so ] || cp -v lib/libonnxruntime.so.* lib/libonnxruntime.so

      - name: Fetch ONNX Runtime (macOS)
        if: runner.os == 'macOS'
        env:
          ORT_VER: 1.22.0
        run: |
          set -euo pipefail
          curl -fsSL -o ort.tgz "https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VER}/onnxruntime-osx-universal2-${ORT_VER}.tgz"
          tar -xzf ort.tgz
          cp -v onnxruntime-*/lib/libonnxruntime*.dylib lib/
          [ -f lib/libonnxruntime.dylib ] || cp -v lib/libonnxruntime*.dylib lib/libonnxruntime.dylib

      - name: Fetch ONNX Runtime (Windows)
        if: runner.os == 'Windows'
        shell: pwsh
        env:
          ORT_VER: 1.22.0
        run: |
          $url = "https://github.com/microsoft/onnxruntime/releases/download/v$env:ORT_VER/onnxruntime-win-x64-$env:ORT_VER.zip"
          Invoke-WebRequest $url -OutFile ort.zip
          Expand-Archive ort.zip -DestinationPath ort
          $dll = Get-ChildItem -Recurse -Path ort -Filter onnxruntime.dll | Select-Object -First 1
          Copy-Item $dll.FullName lib\onnxruntime.dll

      - run: |
          echo "=== BUILD-SETUP END ==="
          echo "lib/ contents:"
          ls -l lib || dir lib
      # ===== END: Injected ORT staging =====

      - name: Install dependencies
        run: |
          ${{ matrix.packages_install }}

      - name: Build artifacts
        run: |
          # Actually do builds and make zips and whatnot
          dist build ${{ needs.plan.outputs.tag-flag }} --print=linkage --output-format=json ${{ matrix.dist_args }} > dist-manifest.json
          echo "dist ran successfully"

      - id: cargo-dist
        name: Post-build
        # We force bash here just because github makes it really hard to get values up
        # to "real" actions without writing to env-vars, and writing to env-vars has
        # inconsistent syntax between shell and powershell.
        shell: bash
        run: |
          # Parse out what we just built and upload it to scratch storage
          echo "paths<<EOF" >> "$GITHUB_OUTPUT"
          dist print-upload-files-from-manifest --manifest dist-manifest.json >> "$GITHUB_OUTPUT"
          echo "EOF" >> "$GITHUB_OUTPUT"

          cp dist-manifest.json "$BUILD_MANIFEST_NAME"
      - name: "Upload artifacts"

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: artifacts-build-local-${{ join(matrix.targets, '_') }}
@@ -167,16 +181,16 @@ jobs:
  build_and_push_docker_image:
    name: Build and Push Docker Image
    runs-on: ubuntu-latest
    needs: [plan]
    if: ${{ needs.plan.outputs.publishing == 'true' }}
    needs: [plan]
    if: ${{ needs.plan.outputs.publishing == 'true' }}
    permissions:
      contents: read # Permission to checkout the repository
      packages: write # Permission to push Docker image to GHCR
      contents: read
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          submodules: recursive # Matches your other checkout steps
          submodules: recursive

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
@@ -185,33 +199,28 @@ jobs:
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }} # User triggering the workflow
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract Docker metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ghcr.io/${{ github.repository }}
          # This action automatically uses the Git tag as the Docker image tag.
          # For example, a Git tag 'v1.2.3' will result in Docker tag 'ghcr.io/owner/repo:v1.2.3'.
          images: ghcr.io/${{ github.repository }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha # Enable Docker layer caching from GitHub Actions cache
          cache-to: type=gha,mode=max # Enable Docker layer caching to GitHub Actions cache
          cache-from: type=gha
          cache-to: type=gha,mode=max

  # Build and package all the platform-agnostic(ish) things
  build-global-artifacts:
    needs:
      - plan
      - build-local-artifacts
    runs-on: "ubuntu-22.04"
    needs: [plan, build-local-artifacts]
    runs-on: ubuntu-22.04
    env:
      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      BUILD_MANIFEST_NAME: target/distrib/global-dist-manifest.json
@@ -219,92 +228,90 @@ jobs:
      - uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Install cached dist
        uses: actions/download-artifact@v4
        with:
          name: cargo-dist-cache
          path: ~/.cargo/bin/
      - run: chmod +x ~/.cargo/bin/dist
      # Get all the local artifacts for the global tasks to use (for e.g. checksums)

      - name: Fetch local artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: artifacts-*
          path: target/distrib/
          merge-multiple: true

      - id: cargo-dist
        shell: bash
        run: |
          dist build ${{ needs.plan.outputs.tag-flag }} --output-format=json "--artifacts=global" > dist-manifest.json
          echo "dist ran successfully"

          # Parse out what we just built and upload it to scratch storage
          echo "paths<<EOF" >> "$GITHUB_OUTPUT"
          jq --raw-output ".upload_files[]" dist-manifest.json >> "$GITHUB_OUTPUT"
          echo "EOF" >> "$GITHUB_OUTPUT"

          cp dist-manifest.json "$BUILD_MANIFEST_NAME"
      - name: "Upload artifacts"

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: artifacts-build-global
          path: |
            ${{ steps.cargo-dist.outputs.paths }}
            ${{ env.BUILD_MANIFEST_NAME }}
  # Determines if we should publish/announce

  host:
    needs:
      - plan
      - build-local-artifacts
      - build-global-artifacts
    # Only run if we're "publishing", and only if local and global didn't fail (skipped is fine)
    needs: [plan, build-local-artifacts, build-global-artifacts]
    if: ${{ always() && needs.plan.outputs.publishing == 'true' && (needs.build-global-artifacts.result == 'skipped' || needs.build-global-artifacts.result == 'success') && (needs.build-local-artifacts.result == 'skipped' || needs.build-local-artifacts.result == 'success') }}
    env:
      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    runs-on: "ubuntu-22.04"
    runs-on: ubuntu-22.04
    outputs:
      val: ${{ steps.host.outputs.manifest }}
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Install cached dist
        uses: actions/download-artifact@v4
        with:
          name: cargo-dist-cache
          path: ~/.cargo/bin/
      - run: chmod +x ~/.cargo/bin/dist
      # Fetch artifacts from scratch-storage

      - name: Fetch artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: artifacts-*
          path: target/distrib/
          merge-multiple: true

      - id: host
        shell: bash
        run: |
          dist host ${{ needs.plan.outputs.tag-flag }} --steps=upload --steps=release --output-format=json > dist-manifest.json
          echo "artifacts uploaded and released successfully"
          cat dist-manifest.json
          echo "manifest=$(jq -c "." dist-manifest.json)" >> "$GITHUB_OUTPUT"
      - name: "Upload dist-manifest.json"
          echo "manifest=$(jq -c . dist-manifest.json)" >> "$GITHUB_OUTPUT"

      - name: Upload dist-manifest.json
        uses: actions/upload-artifact@v4
        with:
          # Overwrite the previous copy
          name: artifacts-dist-manifest
          path: dist-manifest.json
      # Create a GitHub Release while uploading all files to it
      - name: "Download GitHub Artifacts"

      - name: Download GitHub Artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: artifacts-*
          path: artifacts
          merge-multiple: true

      - name: Cleanup
        run: |
          # Remove the granular manifests
          rm -f artifacts/*-dist-manifest.json
        run: rm -f artifacts/*-dist-manifest.json

      - name: Create GitHub Release
        env:
          PRERELEASE_FLAG: "${{ fromJson(steps.host.outputs.manifest).announcement_is_prerelease && '--prerelease' || '' }}"
@@ -312,20 +319,13 @@ jobs:
          ANNOUNCEMENT_BODY: "${{ fromJson(steps.host.outputs.manifest).announcement_github_body }}"
          RELEASE_COMMIT: "${{ github.sha }}"
        run: |
          # Write and read notes from a file to avoid quoting breaking things
          echo "$ANNOUNCEMENT_BODY" > $RUNNER_TEMP/notes.txt

          gh release create "${{ needs.plan.outputs.tag }}" --target "$RELEASE_COMMIT" $PRERELEASE_FLAG --title "$ANNOUNCEMENT_TITLE" --notes-file "$RUNNER_TEMP/notes.txt" artifacts/*

  announce:
    needs:
      - plan
      - host
    # use "always() && ..." to allow us to wait for all publish jobs while
    # still allowing individual publish jobs to skip themselves (for prereleases).
    # "host" however must run to completion, no skipping allowed!
    needs: [plan, host]
    if: ${{ always() && needs.host.result == 'success' }}
    runs-on: "ubuntu-22.04"
    runs-on: ubuntu-22.04
    env:
      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
```
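The plan job drives everything through cargo-dist. A minimal sketch of reproducing that step locally, using the same installer and commands the workflow pins (the output depends on your dist configuration):

```bash
# Install the cargo-dist version pinned by the workflow.
curl --proto '=https' --tlsv1.2 -LsSf \
  https://github.com/axodotdev/cargo-dist/releases/download/v0.30.0/cargo-dist-installer.sh | sh

# Ask dist what it would build, as the plan job does on pull requests.
dist plan --output-format=json > plan-dist-manifest.json
jq -c . plan-dist-manifest.json
```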
**.gitignore** (vendored, 4 lines added)

```diff
@@ -10,6 +10,9 @@ result
data
database

evaluations/cache/
evaluations/reports/

# Devenv
.devenv*
devenv.local.nix
@@ -21,3 +24,4 @@ devenv.local.nix
.pre-commit-config.yaml
# html-router/assets/style.css
html-router/node_modules
.fastembed_cache/
```
**CHANGELOG.md** (new file, 84 lines)

@@ -0,0 +1,84 @@

# Changelog
## Unreleased

## 1.0.2 (2026-02-15)
- Fix: edge case where navigation back to a chat page could trigger a new response generation
- Fix: chat references now validate and render more reliably
- Fix: improved admin access checks for restricted routes
- Performance: faster chat sidebar loads from cached conversation archive data
- API: harmonized ingest endpoint naming and added configurable ingest safety limits
- Security: hardened query handling and ingestion logging to reduce injection and data exposure risk

## 1.0.1 (2026-02-11)
- Shipped an S3 storage backend so content can be stored in object storage instead of local disk, with configuration support for S3 deployments.
- Introduced user theme preferences with the new Obsidian Prism look and improved dark mode styling.
- Fixed edge cases, including content deletion behavior and compatibility for older user records.

## 1.0.0 (2026-01-02)
- **Locally generated embeddings are now the default**. If you want to continue using API embeddings, set EMBEDDING_BACKEND to openai. This will download an ONNX model and recreate all embeddings, but in most instances it's very worth it, since it removes the network-bound call to create embeddings. Creating embeddings on my N100 device is extremely fast; typically a search response is provided in less than 50 ms.
- Added a benchmarks crate for evaluating the retrieval process
- Added fastembed embedding support, which enables the use of local CPU-generated embeddings and greatly improves latency if the machine can handle it. Quick search has vastly better accuracy and is much faster: around 50 ms latency when testing compared to a minimum of 300 ms.
- Embeddings are stored in their own table.
- Refactored the retrieval pipeline to use the new, faster and more accurate strategy. Read the [blog post](https://blog.stark.pub/posts/eval-retrieval-refactor/) for more details.

## Version 0.2.7 (2025-12-04)
- Improved admin page; it now only loads models when specifically requested. Groundwork for coming configuration features.
- Fix: timezone-aware info in scratchpad

## Version 0.2.6 (2025-10-29)
- Added an opt-in FastEmbed-based reranking stage behind `reranking_enabled`. It improves retrieval accuracy by re-scoring hybrid results.
- Fix: default name for relationships harmonized across the application

## Version 0.2.5 (2025-10-24)
- Added manual knowledge entity creation flows using a modal, with the option for suggested relationships
- Scratchpad feature, with the ability to convert scratchpads to content.
- Added knowledge entity search results to the global search
- Backend fixes for improved performance when ingesting and retrieving

## Version 0.2.4 (2025-10-15)
- Improved retrieval performance. Ingestion and chat now utilize full-text search, vector comparison, and graph traversal.
- Ingestion task archive

## Version 0.2.3 (2025-10-12)
- Fix changing vector dimensions on a fresh database (#3)

## Version 0.2.2 (2025-10-07)
- Support for ingestion of PDF files
- Improved ingestion speed
- Fix: deletion of items works as expected
- Fix: enable GPT-5 use via the OpenAI API

## Version 0.2.1 (2025-09-24)
- Fixed API JSON responses so iOS Shortcuts integrations keep working.

## Version 0.2.0 (2025-09-23)
- Revamped the UI with a neobrutalist theme, better dark mode, and a D3-based knowledge graph.
- Added pagination for entities and content plus new observability metrics on the dashboard.
- Enabled audio ingestion and merged the new storage backend.
- Improved performance, request filtering, and journalctl/systemd compatibility.

## Version 0.1.4 (2025-07-01)
- Added image ingestion with configurable system settings and updated Docker Compose docs.
- Hardened admin flows by fixing concurrent API/database calls and normalizing task statuses.

## Version 0.1.3 (2025-06-08)
- Added support for AI providers beyond OpenAI.
- Made the HTTP port configurable for deployments.
- Smoothed graph mapper failures, long content tiles, and refreshed project documentation.

## Version 0.1.2 (2025-05-26)
- Introduced full-text search across indexed knowledge.
- Polished the UI with consistent titles, icon fallbacks, and improved markdown scrolling.
- Fixed search result links and SurrealDB vector formatting glitches.

## Version 0.1.1 (2025-05-13)
- Added streaming feedback to ingestion tasks for clearer progress updates.
- Made the data storage path configurable.
- Improved release tooling with Chromium-enabled Nix flakes, Docker builds, and migration/template fixes.

## Version 0.1.0 (2025-05-06)
- Initial release with a SurrealDB-backed ingestion pipeline, job queue, vector search, and knowledge graph storage.
- Delivered a chat experience featuring streaming responses, conversation history, markdown rendering, and customizable system prompts.
- Introduced an admin console with analytics, registration and timezone controls, and job monitoring.
- Shipped a Tailwind/daisyUI web UI with responsive layouts, modals, content viewers, and editing flows.
- Provided readability-based content ingestion, API/HTML ingress routes, and Docker/Docker Compose tooling.
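The 1.0.0 entry above says local embeddings became the default and that `EMBEDDING_BACKEND` switches back to API embeddings. A minimal sketch of opting back in, assuming the setting follows the same environment-variable convention as the other options in this repository (the exact `config.yaml` key spelling is not shown in this diff):

```bash
# Hypothetical: keep using OpenAI-compatible API embeddings
# instead of the local ONNX default introduced in 1.0.0.
export EMBEDDING_BACKEND=openai
```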
**Cargo.lock** (generated, 3662 lines changed; file diff suppressed because it is too large)
**Cargo.toml** (67 lines changed)

```diff
@@ -5,14 +5,15 @@ members = [
    "api-router",
    "html-router",
    "ingestion-pipeline",
    "composite-retrieval",
    "json-stream-parser"
    "retrieval-pipeline",
    "json-stream-parser",
    "evaluations"
]
resolver = "2"

[workspace.dependencies]
anyhow = "1.0.94"
async-openai = "0.24.1"
async-openai = "0.29.3"
async-stream = "0.3.6"
async-trait = "0.1.88"
axum-htmx = "0.7.0"
@@ -39,9 +40,11 @@ serde_json = "1.0.128"
serde = { version = "1", features = ["derive"] }
sha2 = "0.10.8"
surrealdb-migrations = "2.2.2"
surrealdb = { version = "2", features = ["kv-mem"] }
surrealdb = { version = "2" }
tempfile = "3.12.0"
text-splitter = "0.18.1"
text-splitter = { version = "0.18.1", features = ["markdown", "tokenizers"] }
tokenizers = { version = "0.20.4", features = ["http"] }
unicode-normalization = "0.1.24"
thiserror = "1.0.63"
tokio-util = { version = "0.7.15", features = ["io"] }
tokio = { version = "1", features = ["full"] }
@@ -53,9 +56,61 @@ url = { version = "2.5.2", features = ["serde"] }
uuid = { version = "1.10.0", features = ["v4", "serde"] }
tokio-retry = "0.3.0"
base64 = "0.22.1"
object_store = { version = "0.11.2" }
object_store = { version = "0.11.2", features = ["aws"] }
bytes = "1.7.1"
state-machines = "0.2.0"
fastembed = { version = "5.2.0", default-features = false, features = ["hf-hub-native-tls", "ort-load-dynamic"] }

[profile.dist]
inherits = "release"
lto = "thin"

[workspace.lints.clippy]
# Performance-focused lints
perf = { level = "warn", priority = -1 }
vec_init_then_push = "warn"
large_stack_frames = "warn"
redundant_allocation = "warn"
single_char_pattern = "warn"
string_extend_chars = "warn"
format_in_format_args = "warn"
slow_vector_initialization = "warn"
inefficient_to_string = "warn"
implicit_clone = "warn"
redundant_clone = "warn"

# Security-focused lints
arithmetic_side_effects = "warn"
indexing_slicing = "warn"
unwrap_used = "warn"
expect_used = "warn"
panic = "warn"
unimplemented = "warn"
todo = "warn"

# Async/Network lints
async_yields_async = "warn"
await_holding_invalid_type = "warn"
rc_buffer = "warn"

# Maintainability-focused lints
cargo = { level = "warn", priority = -1 }
pedantic = { level = "warn", priority = -1 }
clone_on_ref_ptr = "warn"
float_cmp = "warn"
manual_string_new = "warn"
uninlined_format_args = "warn"
unused_self = "warn"
must_use_candidate = "allow"
missing_errors_doc = "allow"
missing_panics_doc = "warn"
module_name_repetitions = "warn"
wildcard_dependencies = "warn"
missing_docs_in_private_items = "warn"

# Allow noisy lints that don't add value for this project
needless_raw_string_hashes = "allow"
multiple_bound_locations = "allow"
cargo_common_metadata = "allow"
multiple-crate-versions = "allow"
module_name_repetition = "allow"
```
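The `[workspace.lints.clippy]` table above only takes effect for member crates that opt in; that opt-in lives in the member manifests, which are not part of this diff. A minimal sketch of what each member crate's `Cargo.toml` would contain (standard Cargo workspace-lints behaviour; the file path is a hypothetical example):

```toml
# In a member crate's Cargo.toml, e.g. common/Cargo.toml (hypothetical example):
[lints]
workspace = true
```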
**Dockerfile** (66 lines changed)

```diff
@@ -1,53 +1,51 @@
# === Builder Stage ===
FROM clux/muslrust:1.86.0-stable as builder

# === Builder ===
FROM rust:1.89-bookworm AS builder
WORKDIR /usr/src/minne
RUN apt-get update && apt-get install -y --no-install-recommends \
    pkg-config clang cmake git && rm -rf /var/lib/apt/lists/*

# Cache deps
COPY Cargo.toml Cargo.lock ./
RUN mkdir -p api-router common composite-retrieval html-router ingestion-pipeline json-stream-parser main worker
RUN mkdir -p api-router common retrieval-pipeline html-router ingestion-pipeline json-stream-parser main worker
COPY api-router/Cargo.toml ./api-router/
COPY common/Cargo.toml ./common/
COPY composite-retrieval/Cargo.toml ./composite-retrieval/
COPY retrieval-pipeline/Cargo.toml ./retrieval-pipeline/
COPY html-router/Cargo.toml ./html-router/
COPY ingestion-pipeline/Cargo.toml ./ingestion-pipeline/
COPY json-stream-parser/Cargo.toml ./json-stream-parser/
COPY main/Cargo.toml ./main/
RUN cargo build --release --bin main --features ingestion-pipeline/docker || true

# Build with the MUSL target
RUN cargo build --release --target x86_64-unknown-linux-musl --bin main --features ingestion-pipeline/docker || true

# Copy the rest of the source code
# Build
COPY . .
RUN cargo build --release --bin main --features ingestion-pipeline/docker

# Build the final application binary with the MUSL target
RUN cargo build --release --target x86_64-unknown-linux-musl --bin main --features ingestion-pipeline/docker
# === Runtime ===
FROM debian:bookworm-slim

# === Runtime Stage ===
FROM alpine:latest
# Chromium + runtime deps + OpenMP for ORT
RUN apt-get update && apt-get install -y --no-install-recommends \
    chromium libnss3 libasound2 libgbm1 libxshmfence1 \
    ca-certificates fonts-dejavu fonts-noto-color-emoji \
    libgomp1 libstdc++6 curl \
    && rm -rf /var/lib/apt/lists/*

RUN apk update && apk add --no-cache \
    chromium \
    nss \
    freetype \
    harfbuzz \
    ca-certificates \
    ttf-freefont \
    font-noto-emoji \
    && \
    rm -rf /var/cache/apk/*
# ONNX Runtime (CPU). Keep in sync with ort crate requirements.
ARG ORT_VERSION=1.23.2
RUN mkdir -p /opt/onnxruntime && \
    curl -fsSL -o /tmp/ort.tgz \
    "https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-${ORT_VERSION}.tgz" && \
    tar -xzf /tmp/ort.tgz -C /opt/onnxruntime --strip-components=1 && rm /tmp/ort.tgz

ENV CHROME_BIN=/usr/bin/chromium-browser \
    CHROME_PATH=/usr/lib/chromium/ \
    SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt
ENV CHROME_BIN=/usr/bin/chromium \
    SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt \
    ORT_DYLIB_PATH=/opt/onnxruntime/lib/libonnxruntime.so

# Create a non-root user to run the application
RUN adduser -D -h /home/appuser appuser
WORKDIR /home/appuser
# Non-root
RUN useradd -m appuser
USER appuser
WORKDIR /home/appuser

# Copy the compiled binary from the builder stage (note the target path)
COPY --from=builder /usr/src/minne/target/x86_64-unknown-linux-musl/release/main /usr/local/bin/main

COPY --from=builder /usr/src/minne/target/release/main /usr/local/bin/main
EXPOSE 3000
# EXPOSE 8000-9000

CMD ["main"]
```
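A minimal sketch of building and running the new Debian-based image, using the environment variables documented in the README section below (all values are placeholders, and SurrealDB must be reachable at the given address):

```bash
docker build -t minne:local .

# Placeholders: point these at your own SurrealDB instance and API key.
docker run --rm -p 3000:3000 \
  -e SURREALDB_ADDRESS="ws://surrealdb:8000" \
  -e SURREALDB_USERNAME="root_user" \
  -e SURREALDB_PASSWORD="root_password" \
  -e SURREALDB_DATABASE="minne_db" \
  -e SURREALDB_NAMESPACE="minne_ns" \
  -e OPENAI_API_KEY="your_openai_api_key_here" \
  minne:local
```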
402
README.md
402
README.md
@@ -6,201 +6,148 @@
|
||||
[](https://www.gnu.org/licenses/agpl-3.0)
|
||||
[](https://github.com/perstarkse/minne/releases/latest)
|
||||
|
||||

|
||||

|
||||

|
||||
|
||||
## Demo deployment
|
||||
|
||||
To test _Minne_ out, enter [this](https://minne-demo.stark.pub) read-only demo deployment to view and test functionality out.
|
||||
To test _Minne_ out, enter [this](https://minne.stark.pub) and sign in to a read-only demo deployment to view and test functionality out.
|
||||
|
||||
## Noteworthy Features
|
||||
|
||||
- **Search & Chat Interface** - Find content or knowledge instantly with full-text search, or use the chat mode and conversational AI to find and reason about content
|
||||
- **Manual and AI-assisted connections** - Build entities and relationships manually with full control, let AI create entities and relationships automatically, or blend both approaches with AI suggestions for manual approval
|
||||
- **Hybrid Retrieval System** - Search combining vector similarity & full-text search
|
||||
- **Scratchpad Feature** - Quickly capture thoughts and convert them to permanent content when ready
|
||||
- **Visual Graph Explorer** - Interactive D3-based navigation of your knowledge entities and connections
|
||||
- **Multi-Format Support** - Ingest text, URLs, PDFs, audio files, and images into your knowledge base
|
||||
- **Performance Focus** - Built with Rust and server-side rendering for speed and efficiency
|
||||
- **Self-Hosted & Privacy-Focused** - Full control over your data, and compatible with any OpenAI-compatible API that supports structured outputs
|
||||
|
||||
## The "Why" Behind Minne
|
||||
|
||||
For a while I've been fascinated by Zettelkasten-style PKM systems. While tools like Logseq and Obsidian are excellent, I found the manual linking process to be a hindrance for me. I also wanted a centralized storage and easy access across devices.
|
||||
For a while I've been fascinated by personal knowledge management systems. I wanted something that made it incredibly easy to capture content - snippets of text, URLs, and other media - while automatically discovering connections between ideas. But I also wanted to maintain control over my knowledge structure.
|
||||
|
||||
While developing Minne, I discovered [KaraKeep](https://karakeep.com/) (formerly Hoarder), which is an excellent application in a similar space – you probably want to check it out! However, if you're interested in a PKM that builds an automatic network between related concepts using AI, offers search and the **possibility to chat with your knowledge resource**, and provides a blend of manual and AI-driven organization, then Minne might be worth testing.
|
||||
Traditional tools like Logseq and Obsidian are excellent, but the manual linking process often became a hindrance. Meanwhile, fully automated systems sometimes miss important context or create relationships I wouldn't have chosen myself.
|
||||
|
||||
## Core Philosophy & Features
|
||||
So I built Minne to offer the best of both worlds: effortless content capture with AI-assisted relationship discovery, but with the flexibility to manually curate, edit, or override any connections. You can let AI handle the heavy lifting of extracting entities and finding relationships, take full control yourself, or use a hybrid approach where AI suggests connections that you can approve or modify.
|
||||
|
||||
Minne is designed to make it incredibly easy to save snippets of text, URLs, and other content (limited, pending demand). Simply send content along with a category tag. Minne then ingests this, leveraging AI to create relevant nodes and relationships within its graph database, alongside your manual categorization. This graph backend allows for discoverable connections between your pieces of knowledge.
|
||||
While developing Minne, I discovered [KaraKeep](https://github.com/karakeep-app/karakeep) (formerly Hoarder), which is an excellent application in a similar space – you probably want to check it out! However, if you're interested in a PKM that offers both intelligent automation and manual curation, with the ability to chat with your knowledge base, then Minne might be worth testing.
|
||||
|
||||
You can converse with your knowledge base through an LLM-powered chat interface (via OpenAI compatible API, like Ollama or others). For those who like to see the bigger picture, Minne also includes an **experimental feature to visually explore your knowledge graph.**
|
||||
## Table of Contents
|
||||
|
||||
You may switch and choose between models used, and have the possiblity to change the prompts to your liking. There is since release **0.1.3** the option to change embeddings length, making it easy to test another embedding model.
|
||||
- [Quick Start](#quick-start)
|
||||
- [Features in Detail](#features-in-detail)
|
||||
- [Configuration](#configuration)
|
||||
- [Tech Stack](#tech-stack)
|
||||
- [Application Architecture](#application-architecture)
|
||||
- [AI Configuration](#ai-configuration--model-selection)
|
||||
- [Roadmap](#roadmap)
|
||||
- [Development](#development)
|
||||
- [Contributing](#contributing)
|
||||
- [License](#license)
|
||||
|
||||
The application is built for speed and efficiency using Rust with a Server-Side Rendered (SSR) frontend (HTMX and minimal JavaScript). It's fully responsive, offering a complete mobile interface for reading, editing, and managing your content, including the graph database itself. **PWA (Progressive Web App) support** means you can "install" Minne to your device for a native-like experience. For quick capture on the go on iOS, a [**Shortcut**](https://www.icloud.com/shortcuts/9aa960600ec14329837ba4169f57a166) makes sending content to your Minne instance a breeze.
|
||||
## Quick Start
|
||||
|
||||
Minne is open source (AGPL), self-hostable, and can be deployed flexibly: via Nix, Docker Compose, pre-built binaries, or by building from source. It can run as a single `main` binary or as separate `server` and `worker` processes for optimized resource allocation.
|
||||
The fastest way to get Minne running is with Docker Compose:
|
||||
|
||||
```bash
|
||||
# Clone the repository
|
||||
git clone https://github.com/perstarkse/minne.git
|
||||
cd minne
|
||||
|
||||
# Start Minne and its database
|
||||
docker compose up -d
|
||||
|
||||
# Access at http://localhost:3000
|
||||
```
|
||||
|
||||
**Required Setup:**
|
||||
- Replace `your_openai_api_key_here` in `docker-compose.yml` with your actual API key
|
||||
- Configure `OPENAI_BASE_URL` if using a custom AI provider (like Ollama)
|
||||
|
||||
For detailed installation options, see [Configuration](#configuration).
|
||||
|
||||
## Features in Detail
|
||||
|
||||
### Search vs. Chat mode
|
||||
|
||||
**Search** - Use when you know roughly what you're looking for. Full-text search finds items quickly by matching your query terms.
|
||||
|
||||
**Chat Mode** - Use when you want to explore concepts, find connections, or reason about your knowledge. The AI analyzes your query and finds relevant context across your entire knowledge base.
|
||||
|
||||
### Content Processing
|
||||
|
||||
Minne automatically processes content you save:
|
||||
1. **Web scraping** extracts readable text from URLs
|
||||
2. **Text analysis** identifies key concepts and relationships
|
||||
3. **Graph creation** builds connections between related content
|
||||
4. **Embedding generation** enables semantic search capabilities
|
||||
|
||||
### Visual Knowledge Graph
|
||||
|
||||
Explore your knowledge as an interactive network with flexible curation options:
|
||||
|
||||
**Manual Curation** - Create knowledge entities and relationships yourself with full control over your graph structure
|
||||
|
||||
**AI Automation** - Let AI automatically extract entities and discover relationships from your content
|
||||
|
||||
**Hybrid Approach** - Get AI-suggested relationships and entities that you can manually review, edit, or approve
|
||||
|
||||
The graph visualization shows:
|
||||
- Knowledge entities as nodes (manually created or AI-extracted)
|
||||
- Relationships as connections (manually defined, AI-discovered, or suggested)
|
||||
- Interactive navigation for discovery and editing
|
||||
|
||||
### Optional FastEmbed Reranking
|
||||
|
||||
Minne ships with an opt-in reranking stage powered by [fastembed-rs](https://github.com/Anush008/fastembed-rs). When enabled, the hybrid retrieval results are rescored with a lightweight cross-encoder before being returned to chat or ingestion flows. In practice this often means more relevant results, boosting answer quality and downstream enrichment.
|
||||
|
||||
⚠️ **Resource notes**
|
||||
- Enabling reranking downloads and caches ~1.1 GB of model data on first startup (cached under `<data_dir>/fastembed/reranker` by default).
|
||||
- Initialization takes longer while warming the cache, and each query consumes extra CPU. The default pool size (2) is tuned for a singe user setup, but could work with a pool size on 1 as well.
|
||||
- The feature is disabled by default. Set `reranking_enabled: true` (or `RERANKING_ENABLED=true`) if you’re comfortable with the additional footprint.
|
||||
|
||||
Example configuration:
|
||||
|
||||
```yaml
|
||||
reranking_enabled: true
|
||||
reranking_pool_size: 2
|
||||
fastembed_cache_dir: "/var/lib/minne/fastembed" # optional override, defaults to .fastembed_cache
|
||||
```
|
||||
|
||||
## Tech Stack
|
||||
|
||||
- **Backend:** Rust. Server-Side Rendering (SSR). Axum. Minijinja for templating.
|
||||
- **Frontend:** HTML. HTMX and plain JavaScript for interactivity.
|
||||
- **Database:** SurrealDB
|
||||
- **AI Integration:** OpenAI API compatible endpoint (for chat and content processing), with support for structured outputs.
|
||||
- **Web Content Processing:** Relies on a Chromium instance for robust webpage fetching/rendering.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- **For Docker/Nix:** Docker or Nix installed. These methods handle SurrealDB and Chromium dependencies.
|
||||
- **For Binaries/Source:**
|
||||
- A running SurrealDB instance.
|
||||
- Chromium (or a compatible Chrome browser) installed and accessible in your `PATH`.
|
||||
- Git (if cloning and building from source).
|
||||
- Rust toolchain (if building from source).
|
||||
|
||||
## Getting Started
|
||||
|
||||
You have several options to get Minne up and running:
|
||||
|
||||
### 1. Nix (Recommended for ease of dependency management)
|
||||
|
||||
If you have Nix installed, you can run Minne directly:
|
||||
|
||||
```bash
|
||||
nix run 'github:perstarkse/minne#main'
|
||||
```
|
||||
|
||||
This command will fetch Minne and its dependencies (including Chromium) and run the `main` (combined server/worker) application.
|
||||
|
||||
### 2. Docker Compose (Recommended for containerized environments)
|
||||
|
||||
This is a great way to manage Minne and its SurrealDB dependency together.
|
||||
|
||||
1. Clone the repository (or just save the `docker-compose.yml` below).
|
||||
|
||||
1. Create a `docker-compose.yml` file:
|
||||
|
||||
```yaml
|
||||
version: "3.8"
|
||||
services:
|
||||
minne:
|
||||
image: ghcr.io/perstarkse/minne:latest # Pulls the latest pre-built image
|
||||
# Or, to build from local source:
|
||||
# build: .
|
||||
container_name: minne_app
|
||||
ports:
|
||||
- "3000:3000" # Exposes Minne on port 3000
|
||||
environment:
|
||||
# These are examples, ensure they match your SurrealDB setup below
|
||||
# and your actual OpenAI key.
|
||||
SURREALDB_ADDRESS: "ws://surrealdb:8000"
|
||||
SURREALDB_USERNAME: "root_user" # Default from SurrealDB service below
|
||||
SURREALDB_PASSWORD: "root_password" # Default from SurrealDB service below
|
||||
SURREALDB_DATABASE: "minne_db"
|
||||
SURREALDB_NAMESPACE: "minne_ns"
|
||||
OPENAI_API_KEY: "your_openai_api_key_here" # IMPORTANT: Replace with your actual key
|
||||
#OPENAI_BASE_URL: "your_ollama_address" # Uncomment this and change it to override the default openai base url
|
||||
HTTP_PORT: 3000
|
||||
DATA_DIR: "/data" # Data directory inside the container
|
||||
RUST_LOG: "minne=info,tower_http=info" # Example logging level
|
||||
volumes:
|
||||
- ./minne_data:/data # Persists Minne's data (e.g., scraped content) on the host
|
||||
depends_on:
|
||||
- surrealdb
|
||||
networks:
|
||||
- minne-net
|
||||
# Waits for SurrealDB to be ready before starting Minne
|
||||
command: >
|
||||
sh -c "
|
||||
echo 'Waiting for SurrealDB to start...' &&
|
||||
# Adjust sleep time if SurrealDB takes longer to initialize in your environment
|
||||
until nc -z surrealdb 8000; do echo 'Waiting for SurrealDB...'; sleep 2; done &&
|
||||
echo 'SurrealDB is up, starting Minne application...' &&
|
||||
/usr/local/bin/main
|
||||
"
|
||||
# For separate server/worker:
|
||||
# command: /usr/local/bin/server # or /usr/local/bin/worker
|
||||
|
||||
surrealdb:
|
||||
image: surrealdb/surrealdb:latest
|
||||
container_name: minne_surrealdb
|
||||
ports:
|
||||
# Exposes SurrealDB on port 8000 (primarily for direct access/debugging if needed,
|
||||
# not strictly required for Minne if only accessed internally by the minne service)
|
||||
- "127.0.0.1:8000:8000" # Bind to localhost only for SurrealDB by default
|
||||
volumes:
|
||||
# Persists SurrealDB data on the host in a 'surreal_database' folder
|
||||
- ./surreal_database:/database
|
||||
command: >
|
||||
start
|
||||
--log info # Consider 'debug' for troubleshooting
|
||||
--user root_user
|
||||
--pass root_password
|
||||
file:/database/minne_v1.db # Using file-based storage for simplicity
|
||||
networks:
|
||||
- minne-net
|
||||
|
||||
volumes:
|
||||
minne_data: {} # Defines a named volume for Minne data (can be managed by Docker)
|
||||
surreal_database: {} # Defines a named volume for SurrealDB data
|
||||
|
||||
networks:
|
||||
minne-net:
|
||||
driver: bridge
|
||||
```
|
||||
|
||||
1. Run:
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
Minne will be accessible at `http://localhost:3000`.
|
||||
|
||||
### 3. Pre-built Binaries (GitHub Releases)
|
||||
|
||||
Binaries for Windows, macOS, and Linux (combined `main` version) are available on the [GitHub Releases page](https://github.com/perstarkse/minne/releases/latest).
|
||||
|
||||
1. Download the appropriate binary for your system.
|
||||
1. **You will need to provide and run SurrealDB and have Chromium installed and accessible in your PATH separately.**
|
||||
1. Set the required [Configuration](#configuration) environment variables or use a `config.yaml`.
|
||||
1. Run the executable.
|
||||
|
||||
### 4. Build from Source
|
||||
|
||||
1. Clone the repository:
|
||||
```bash
|
||||
git clone https://github.com/perstarkse/minne.git
|
||||
cd minne
|
||||
```
|
||||
1. **You will need to provide and run SurrealDB and have Chromium installed and accessible in your PATH separately.**
|
||||
1. Set the required [Configuration](#configuration) environment variables or use a `config.yaml`.
|
||||
1. Build and run:
|
||||
- For the combined `main` binary:
|
||||
```bash
|
||||
cargo run --release --bin main
|
||||
```
|
||||
- For the `server` binary:
|
||||
```bash
|
||||
cargo run --release --bin server
|
||||
```
|
||||
- For the `worker` binary (if you want to run it separately):
|
||||
```bash
|
||||
cargo run --release --bin worker
|
||||
```
|
||||
The compiled binaries will be in `target/release/`.
|
||||
- **Backend:** Rust with Axum framework and Server-Side Rendering (SSR)
|
||||
- **Frontend:** HTML with HTMX and minimal JavaScript for interactivity
|
||||
- **Database:** SurrealDB (graph, document, and vector search)
|
||||
- **AI Integration:** OpenAI-compatible API with structured outputs
|
||||
- **Web Processing:** Headless Chrome for robust webpage content extraction
|
||||
|
||||
## Configuration
|
||||
|
||||
Minne can be configured using environment variables or a `config.yaml` file placed in the working directory where you run the application. Environment variables take precedence over `config.yaml`.
|
||||
Minne can be configured using environment variables or a `config.yaml` file. Environment variables take precedence over `config.yaml`.
|
||||
|
||||
**Required Configuration:**
|
||||
### Required Configuration
|
||||
|
||||
- `SURREALDB_ADDRESS`: WebSocket address of your SurrealDB instance (e.g., `ws://127.0.0.1:8000` or `ws://surrealdb:8000` for Docker).
|
||||
- `SURREALDB_USERNAME`: Username for SurrealDB (e.g., `root_user`).
|
||||
- `SURREALDB_PASSWORD`: Password for SurrealDB (e.g., `root_password`).
|
||||
- `SURREALDB_DATABASE`: Database name in SurrealDB (e.g., `minne_db`).
|
||||
- `SURREALDB_NAMESPACE`: Namespace in SurrealDB (e.g., `minne_ns`).
|
||||
- `OPENAI_API_KEY`: Your API key for OpenAI (e.g., `sk-YourActualOpenAIKeyGoesHere`).
|
||||
- `HTTP_PORT`: Port for the Minne server to listen on (Default: `3000`).
|
||||
- `SURREALDB_ADDRESS`: WebSocket address of your SurrealDB instance (e.g., `ws://127.0.0.1:8000`)
|
||||
- `SURREALDB_USERNAME`: Username for SurrealDB (e.g., `root_user`)
|
||||
- `SURREALDB_PASSWORD`: Password for SurrealDB (e.g., `root_password`)
|
||||
- `SURREALDB_DATABASE`: Database name in SurrealDB (e.g., `minne_db`)
|
||||
- `SURREALDB_NAMESPACE`: Namespace in SurrealDB (e.g., `minne_ns`)
|
||||
- `OPENAI_API_KEY`: Your API key for OpenAI compatible endpoint
|
||||
- `HTTP_PORT`: Port for the Minne server (Default: `3000`)
|
||||
|
||||
**Optional Configuration:**
|
||||
### Optional Configuration
|
||||
|
||||
- `RUST_LOG`: Controls logging level (e.g., `minne=info,tower_http=debug`).
|
||||
- `DATA_DIR`: Directory to store local data like fetched webpage content (e.g., `./data`).
|
||||
- `OPENAI_BASE_URL`: Base URL to a OpenAI API provider, such as Ollama.
|
||||
- `RUST_LOG`: Controls logging level (e.g., `minne=info,tower_http=debug`)
|
||||
- `DATA_DIR`: Directory to store local data (e.g., `./data`)
|
||||
- `OPENAI_BASE_URL`: Base URL for custom AI providers (like Ollama)
|
||||
- `RERANKING_ENABLED` / `reranking_enabled`: Set to `true` to enable the FastEmbed reranking stage (default `false`)
|
||||
- `RERANKING_POOL_SIZE` / `reranking_pool_size`: Maximum concurrent reranker workers (defaults to `2`)
|
||||
- `FASTEMBED_CACHE_DIR` / `fastembed_cache_dir`: Directory for cached FastEmbed models (defaults to `<data_dir>/fastembed/reranker`)
|
||||
- `FASTEMBED_SHOW_DOWNLOAD_PROGRESS` / `fastembed_show_download_progress`: Show model download progress when warming the cache (default `true`)
|
||||
|
||||
**Example `config.yaml`:**
|
||||
### Example config.yaml
|
||||
|
||||
```yaml
|
||||
surrealdb_address: "ws://127.0.0.1:8000"
|
||||
@@ -210,70 +157,109 @@ surrealdb_database: "minne_db"
|
||||
surrealdb_namespace: "minne_ns"
|
||||
openai_api_key: "sk-YourActualOpenAIKeyGoesHere"
|
||||
data_dir: "./minne_app_data"
|
||||
http_port: 3000
|
||||
# rust_log: "info"
|
||||
# http_port: 3000
|
||||
```
|
||||
|
||||
## Application Architecture (Binaries)
|
||||
## Installation Options
|
||||
|
||||
Minne offers flexibility in deployment:
|
||||
### 1. Docker Compose (Recommended)
|
||||
|
||||
- **`main`**: A combined binary running both server (API, web UI) and worker (background tasks) in one process. Ideal for simpler setups.
|
||||
- **`server`**: Runs only the server component.
|
||||
- **`worker`**: Runs only the worker component, suitable for deployment on a machine with more resources for intensive tasks.
|
||||
```bash
|
||||
# Clone and run
|
||||
git clone https://github.com/perstarkse/minne.git
|
||||
cd minne
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
This modularity allows scaling and resource optimization. The `main` binary or the Docker Compose setup (using `main`) is sufficient for most users.
|
||||
The included `docker-compose.yml` handles SurrealDB and Chromium dependencies automatically.
|
||||
|
||||
### 2. Nix
|
||||
|
||||
```bash
|
||||
nix run 'github:perstarkse/minne#main'
|
||||
```
|
||||
|
||||
This fetches Minne and all dependencies, including Chromium.
|
||||
|
||||
### 3. Pre-built Binaries
|
||||
|
||||
Download binaries for Windows, macOS, and Linux from the [GitHub Releases](https://github.com/perstarkse/minne/releases/latest).
|
||||
|
||||
**Requirements:** You'll need to provide SurrealDB and Chromium separately.
|
||||
|
||||
### 4. Build from Source
|
||||
|
||||
```bash
|
||||
git clone https://github.com/perstarkse/minne.git
|
||||
cd minne
|
||||
cargo run --release --bin main
|
||||
```
|
||||
|
||||
**Requirements:** SurrealDB and Chromium must be installed and accessible in your PATH.
|
||||
|
||||
## Application Architecture
|
||||
|
||||
Minne offers flexible deployment options:
|
||||
|
||||
- **`main`**: Combined server and worker in one process (recommended for most users)
|
||||
- **`server`**: Web interface and API only
|
||||
- **`worker`**: Background processing only (for resource optimization)
|
||||
|
||||
## Usage
|
||||
|
||||
Once Minne is running:
|
||||
Once Minne is running at `http://localhost:3000`:
|
||||
|
||||
1. Access the web interface at `http://localhost:3000` (or your configured port).
|
||||
1. On iOS, consider setting up the [Minne iOS Shortcut](https://www.icloud.com/shortcuts/9aa960600ec14329837ba4169f57a166) for effortless content sending. **Add the shortcut, then replace the [insert_url] and [insert_api_key] placeholders with your instance URL and API key.**
|
||||
1. Add notes, URLs, **audio files**, and explore your growing knowledge graph.
|
||||
1. Engage with the chat interface to query your saved content.
|
||||
1. Try the experimental visual graph explorer to see connections.
|
||||
1. **Web Interface**: Full-featured experience for desktop and mobile
|
||||
2. **iOS Shortcut**: Use the [Minne iOS Shortcut](https://www.icloud.com/shortcuts/e433fbd7602f4e2eaa70dca162323477) for quick content capture (it posts to the same HTTP API sketched after this list)
|
||||
3. **Content Types**: Save notes, URLs, audio files, and more
|
||||
4. **Knowledge Graph**: Explore automatic connections between your content
|
||||
5. **Chat Interface**: Query your knowledge base conversationally
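The iOS Shortcut, and any other client you write, goes through the same HTTP API as the web UI. As a rough sketch based on the `ingest` route and Bearer-token middleware shown further down this page (the `/api` path prefix is an assumption, and the port depends on your deployment):

```bash
# Hypothetical example: send a note plus one file to the ingest endpoint.
# Replace the URL, path prefix and API key with your own values.
curl -X POST "http://localhost:3000/api/ingest" \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -F "content=Remember to water the plants" \
  -F "context=personal notes" \
  -F "category=home" \
  -F "files=@./receipt.pdf"
```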
|
||||
|
||||
## AI Configuration & Model Selection
|
||||
|
||||
Minne relies on an OpenAI-compatible API for processing content, generating graph relationships, and powering the chat feature.
|
||||
### Setting Up AI Providers
|
||||
|
||||
**Environment Variables / `config.yaml` keys:**
|
||||
Minne uses OpenAI-compatible APIs. Configure via environment variables or `config.yaml`:
|
||||
|
||||
- `OPENAI_API_KEY` (required): Your API key for the chosen AI provider.
|
||||
- `OPENAI_BASE_URL` (optional): Use this to override the default OpenAI API URL (`https://api.openai.com/v1`). This is essential for using local models via services like Ollama or other API providers.
|
||||
- **Example for Ollama:** `http://<your-ollama-ip>:11434/v1`
|
||||
- `OPENAI_API_KEY` (required): Your API key
|
||||
- `OPENAI_BASE_URL` (optional): Custom provider URL (e.g., Ollama: `http://localhost:11434/v1`); a full export sketch follows this list
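For example, pointing Minne at a local Ollama instance is just a matter of overriding the base URL (a sketch; Ollama ignores the key value, but since the key is required it still needs a placeholder):

```bash
# Use a local Ollama server instead of api.openai.com
export OPENAI_BASE_URL="http://localhost:11434/v1"
export OPENAI_API_KEY="ollama"   # placeholder; Ollama does not check it
```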
|
||||
|
||||
### Changing Models
|
||||
### Model Selection
|
||||
|
||||
Once you have configured the `OPENAI_BASE_URL` to point to your desired provider, you can select the specific models Minne should use.
|
||||
|
||||
1. Navigate to the `/admin` page in your Minne instance.
|
||||
1. The page will list the models available from your configured endpoint. You can select different models for processing content and for chat.
|
||||
1. **Important:** For content processing, Minne relies on structured outputs (function calling). The model and provider you select for this task **must** support this feature.
|
||||
1. **Embedding Dimensions:** If you change the embedding model, you **must** update the "Embedding Dimensions" setting in the admin panel to match the output dimensions of your new model (e.g., `text-embedding-3-small` uses 1536, `nomic-embed-text` uses 768). Mismatched dimensions will cause errors. Some newer models accept a dimensions argument; for those, whatever value you configure here should work.
|
||||
1. Access the `/admin` page in your Minne instance
|
||||
2. Select models for content processing and chat from your configured provider
|
||||
3. **Content Processing Requirements**: The model must support structured outputs
|
||||
4. **Embedding Dimensions**: Update this setting when changing embedding models (e.g., 1536 for `text-embedding-3-small`, 768 for `nomic-embed-text`)
|
||||
|
||||
## Roadmap
|
||||
|
||||
I've developed Minne primarily for my own use, but having spent a long time in the self-hosted space and benefited from the work of others, I thought I'd share it with the community. Feature requests are welcome.
|
||||
The roadmap as of now is:
|
||||
Current development focus:
|
||||
|
||||
- Handle uploaded images wisely.
|
||||
- An updated explorer of the graph database.
|
||||
- A TUI frontend which opens your system default editor for improved writing and document management.
|
||||
- TUI frontend with system editor integration
|
||||
- Enhanced reranking for improved retrieval recall
|
||||
- Additional content type support
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions are welcome! Whether it's bug reports, feature suggestions, documentation improvements, or code contributions, please feel free to open an issue or submit a pull request.
|
||||
Feature requests and contributions are welcome!
|
||||
|
||||
## Development
|
||||
|
||||
Run tests with:
|
||||
```rust
|
||||
```bash
|
||||
# Run tests
|
||||
cargo test
|
||||
|
||||
# Development build
|
||||
cargo build
|
||||
|
||||
# Comprehensive linting
|
||||
cargo clippy --workspace --all-targets --all-features
|
||||
```
|
||||
There is currently a variety of unit tests for commonly used functions. Additional tests, especially integration tests, would be very welcome.
|
||||
|
||||
The codebase includes extensive unit tests. Integration tests and additional contributions are welcome.
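To narrow a test run to a single crate, the usual Cargo filters apply (the `common` crate name here is inferred from the workspace layout shown elsewhere on this page):

```bash
# Run only the tests in one workspace crate
cargo test -p common
```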
|
||||
|
||||
## Contributing
|
||||
I've developed Minne primarily for my own use, but having spent a long time in the self-hosted space and benefited from the work of others, I thought I'd share it with the community. Feature requests are welcome.
|
||||
|
||||
## License
|
||||
|
||||
Minne is licensed under the **GNU Affero General Public License v3.0 (AGPL-3.0)**. See the [LICENSE](LICENSE) file for details. This means if you run a modified version of Minne as a network service, you must also offer the source code of that modified version to its users.
|
||||
Minne is licensed under the **GNU Affero General Public License v3.0 (AGPL-3.0)**. See the [LICENSE](LICENSE) file for details.
|
||||
|
||||
@@ -4,6 +4,9 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "AGPL-3.0-or-later"
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
[dependencies]
|
||||
tokio = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
|
||||
@@ -1,15 +1,22 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::{storage::db::SurrealDbClient, utils::config::AppConfig};
|
||||
use common::{
|
||||
storage::{db::SurrealDbClient, store::StorageManager},
|
||||
utils::config::AppConfig,
|
||||
};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct ApiState {
|
||||
pub db: Arc<SurrealDbClient>,
|
||||
pub config: AppConfig,
|
||||
pub storage: StorageManager,
|
||||
}
|
||||
|
||||
impl ApiState {
|
||||
pub async fn new(config: &AppConfig) -> Result<Self, Box<dyn std::error::Error>> {
|
||||
pub async fn new(
|
||||
config: &AppConfig,
|
||||
storage: StorageManager,
|
||||
) -> Result<Self, Box<dyn std::error::Error>> {
|
||||
let surreal_db_client = Arc::new(
|
||||
SurrealDbClient::new(
|
||||
&config.surrealdb_address,
|
||||
@@ -23,9 +30,10 @@ impl ApiState {
|
||||
|
||||
surreal_db_client.apply_migrations().await?;
|
||||
|
||||
let app_state = ApiState {
|
||||
let app_state = Self {
|
||||
db: surreal_db_client.clone(),
|
||||
config: config.clone(),
|
||||
storage,
|
||||
};
|
||||
|
||||
Ok(app_state)
|
||||
|
||||
@@ -20,6 +20,9 @@ pub enum ApiError {
|
||||
|
||||
#[error("Unauthorized: {0}")]
|
||||
Unauthorized(String),
|
||||
|
||||
#[error("Payload too large: {0}")]
|
||||
PayloadTooLarge(String),
|
||||
}
|
||||
|
||||
impl From<AppError> for ApiError {
|
||||
@@ -27,46 +30,53 @@ impl From<AppError> for ApiError {
|
||||
match err {
|
||||
AppError::Database(_) | AppError::OpenAI(_) => {
|
||||
tracing::error!("Internal error: {:?}", err);
|
||||
ApiError::InternalError("Internal server error".to_string())
|
||||
Self::InternalError("Internal server error".to_string())
|
||||
}
|
||||
AppError::NotFound(msg) => ApiError::NotFound(msg),
|
||||
AppError::Validation(msg) => ApiError::ValidationError(msg),
|
||||
AppError::Auth(msg) => ApiError::Unauthorized(msg),
|
||||
_ => ApiError::InternalError("Internal server error".to_string()),
|
||||
AppError::NotFound(msg) => Self::NotFound(msg),
|
||||
AppError::Validation(msg) => Self::ValidationError(msg),
|
||||
AppError::Auth(msg) => Self::Unauthorized(msg),
|
||||
_ => Self::InternalError("Internal server error".to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl IntoResponse for ApiError {
|
||||
fn into_response(self) -> Response {
|
||||
let (status, error_response) = match self {
|
||||
ApiError::InternalError(message) => (
|
||||
Self::InternalError(message) => (
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
ErrorResponse {
|
||||
error: message,
|
||||
status: "error".to_string(),
|
||||
},
|
||||
),
|
||||
ApiError::ValidationError(message) => (
|
||||
Self::ValidationError(message) => (
|
||||
StatusCode::BAD_REQUEST,
|
||||
ErrorResponse {
|
||||
error: message,
|
||||
status: "error".to_string(),
|
||||
},
|
||||
),
|
||||
ApiError::NotFound(message) => (
|
||||
Self::NotFound(message) => (
|
||||
StatusCode::NOT_FOUND,
|
||||
ErrorResponse {
|
||||
error: message,
|
||||
status: "error".to_string(),
|
||||
},
|
||||
),
|
||||
ApiError::Unauthorized(message) => (
|
||||
Self::Unauthorized(message) => (
|
||||
StatusCode::UNAUTHORIZED,
|
||||
ErrorResponse {
|
||||
error: message,
|
||||
status: "error".to_string(),
|
||||
},
|
||||
),
|
||||
Self::PayloadTooLarge(message) => (
|
||||
StatusCode::PAYLOAD_TOO_LARGE,
|
||||
ErrorResponse {
|
||||
error: message,
|
||||
status: "error".to_string(),
|
||||
},
|
||||
),
|
||||
};
|
||||
|
||||
(status, Json(error_response)).into_response()
|
||||
@@ -132,6 +142,10 @@ mod tests {
|
||||
// Test unauthorized status
|
||||
let error = ApiError::Unauthorized("not allowed".to_string());
|
||||
assert_status_code(error, StatusCode::UNAUTHORIZED);
|
||||
|
||||
// Test payload too large status
|
||||
let error = ApiError::PayloadTooLarge("too big".to_string());
|
||||
assert_status_code(error, StatusCode::PAYLOAD_TOO_LARGE);
|
||||
}
|
||||
|
||||
// Alternative approach that doesn't try to parse the response body
|
||||
|
||||
@@ -6,7 +6,7 @@ use axum::{
|
||||
Router,
|
||||
};
|
||||
use middleware_api_auth::api_auth;
|
||||
use routes::{categories::get_categories, ingress::ingest_data, liveness::live, readiness::ready};
|
||||
use routes::{categories::get_categories, ingest::ingest_data, liveness::live, readiness::ready};
|
||||
|
||||
pub mod api_state;
|
||||
pub mod error;
|
||||
@@ -26,9 +26,13 @@ where
|
||||
|
||||
// Protected API endpoints (require auth)
|
||||
let protected = Router::new()
|
||||
.route("/ingress", post(ingest_data))
|
||||
.route(
|
||||
"/ingest",
|
||||
post(ingest_data).layer(DefaultBodyLimit::max(
|
||||
app_state.config.ingest_max_body_bytes,
|
||||
)),
|
||||
)
|
||||
.route("/categories", get(get_categories))
|
||||
.layer(DefaultBodyLimit::max(1024 * 1024 * 1024))
|
||||
.route_layer(from_fn_with_state(app_state.clone(), api_auth));
|
||||
|
||||
public.merge(protected)
|
||||
|
||||
@@ -13,14 +13,12 @@ pub async fn api_auth(
|
||||
mut request: Request,
|
||||
next: Next,
|
||||
) -> Result<Response, ApiError> {
|
||||
let api_key = extract_api_key(&request).ok_or(ApiError::Unauthorized(
|
||||
"You have to be authenticated".to_string(),
|
||||
))?;
|
||||
let api_key = extract_api_key(&request)
|
||||
.ok_or_else(|| ApiError::Unauthorized("You have to be authenticated".to_string()))?;
|
||||
|
||||
let user = User::find_by_api_key(&api_key, &state.db).await?;
|
||||
let user = user.ok_or(ApiError::Unauthorized(
|
||||
"You have to be authenticated".to_string(),
|
||||
))?;
|
||||
let user =
|
||||
user.ok_or_else(|| ApiError::Unauthorized("You have to be authenticated".to_string()))?;
|
||||
|
||||
request.extensions_mut().insert(user);
|
||||
|
||||
@@ -37,7 +35,7 @@ fn extract_api_key(request: &Request) -> Option<String> {
|
||||
.headers()
|
||||
.get("Authorization")
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.and_then(|auth| auth.strip_prefix("Bearer ").map(|s| s.trim()))
|
||||
.and_then(|auth| auth.strip_prefix("Bearer ").map(str::trim))
|
||||
})
|
||||
.map(String::from)
|
||||
}
|
||||
|
||||
88
api-router/src/routes/ingest.rs
Normal file
@@ -0,0 +1,88 @@
|
||||
use axum::{extract::State, http::StatusCode, response::IntoResponse, Extension, Json};
|
||||
use axum_typed_multipart::{FieldData, TryFromMultipart, TypedMultipart};
|
||||
use common::{
|
||||
error::AppError,
|
||||
storage::types::{
|
||||
file_info::FileInfo, ingestion_payload::IngestionPayload, ingestion_task::IngestionTask,
|
||||
user::User,
|
||||
},
|
||||
utils::ingest_limits::{validate_ingest_input, IngestValidationError},
|
||||
};
|
||||
use futures::{future::try_join_all, TryFutureExt};
|
||||
use serde_json::json;
|
||||
use tempfile::NamedTempFile;
|
||||
use tracing::info;
|
||||
|
||||
use crate::{api_state::ApiState, error::ApiError};
|
||||
|
||||
#[derive(Debug, TryFromMultipart)]
|
||||
pub struct IngestParams {
|
||||
pub content: Option<String>,
|
||||
pub context: String,
|
||||
pub category: String,
|
||||
#[form_data(limit = "20000000")]
|
||||
#[form_data(default)]
|
||||
pub files: Vec<FieldData<NamedTempFile>>,
|
||||
}
|
||||
|
||||
pub async fn ingest_data(
|
||||
State(state): State<ApiState>,
|
||||
Extension(user): Extension<User>,
|
||||
TypedMultipart(input): TypedMultipart<IngestParams>,
|
||||
) -> Result<impl IntoResponse, ApiError> {
|
||||
let user_id = user.id;
|
||||
let content_bytes = input.content.as_ref().map_or(0, |c| c.len());
|
||||
let has_content = input.content.as_ref().is_some_and(|c| !c.trim().is_empty());
|
||||
let context_bytes = input.context.len();
|
||||
let category_bytes = input.category.len();
|
||||
let file_count = input.files.len();
|
||||
|
||||
match validate_ingest_input(
|
||||
&state.config,
|
||||
input.content.as_deref(),
|
||||
&input.context,
|
||||
&input.category,
|
||||
file_count,
|
||||
) {
|
||||
Ok(()) => {}
|
||||
Err(IngestValidationError::PayloadTooLarge(message)) => {
|
||||
return Err(ApiError::PayloadTooLarge(message));
|
||||
}
|
||||
Err(IngestValidationError::BadRequest(message)) => {
|
||||
return Err(ApiError::ValidationError(message));
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
user_id = %user_id,
|
||||
has_content,
|
||||
content_bytes,
|
||||
context_bytes,
|
||||
category_bytes,
|
||||
file_count,
|
||||
"Received ingest request"
|
||||
);
|
||||
|
||||
let file_infos = try_join_all(input.files.into_iter().map(|file| {
|
||||
FileInfo::new_with_storage(file, &state.db, &user_id, &state.storage)
|
||||
.map_err(AppError::from)
|
||||
}))
|
||||
.await?;
|
||||
|
||||
let payloads = IngestionPayload::create_ingestion_payload(
|
||||
input.content,
|
||||
input.context,
|
||||
input.category,
|
||||
file_infos,
|
||||
&user_id,
|
||||
)?;
|
||||
|
||||
let futures: Vec<_> = payloads
|
||||
.into_iter()
|
||||
.map(|object| IngestionTask::create_and_add_to_db(object, user_id.clone(), &state.db))
|
||||
.collect();
|
||||
|
||||
try_join_all(futures).await?;
|
||||
|
||||
Ok((StatusCode::OK, Json(json!({ "status": "success" }))))
|
||||
}
|
||||
@@ -1,56 +0,0 @@
|
||||
use axum::{extract::State, http::StatusCode, response::IntoResponse, Extension};
|
||||
use axum_typed_multipart::{FieldData, TryFromMultipart, TypedMultipart};
|
||||
use common::{
|
||||
error::AppError,
|
||||
storage::types::{
|
||||
file_info::FileInfo, ingestion_payload::IngestionPayload, ingestion_task::IngestionTask,
|
||||
user::User,
|
||||
},
|
||||
};
|
||||
use futures::{future::try_join_all, TryFutureExt};
|
||||
use tempfile::NamedTempFile;
|
||||
use tracing::info;
|
||||
|
||||
use crate::{api_state::ApiState, error::ApiError};
|
||||
|
||||
#[derive(Debug, TryFromMultipart)]
|
||||
pub struct IngestParams {
|
||||
pub content: Option<String>,
|
||||
pub context: String,
|
||||
pub category: String,
|
||||
#[form_data(limit = "10000000")] // Adjust limit as needed
|
||||
#[form_data(default)]
|
||||
pub files: Vec<FieldData<NamedTempFile>>,
|
||||
}
|
||||
|
||||
pub async fn ingest_data(
|
||||
State(state): State<ApiState>,
|
||||
Extension(user): Extension<User>,
|
||||
TypedMultipart(input): TypedMultipart<IngestParams>,
|
||||
) -> Result<impl IntoResponse, ApiError> {
|
||||
info!("Received input: {:?}", input);
|
||||
|
||||
let file_infos = try_join_all(input.files.into_iter().map(|file| {
|
||||
FileInfo::new(file, &state.db, &user.id, &state.config).map_err(AppError::from)
|
||||
}))
|
||||
.await?;
|
||||
|
||||
let payloads = IngestionPayload::create_ingestion_payload(
|
||||
input.content,
|
||||
input.context,
|
||||
input.category,
|
||||
file_infos,
|
||||
user.id.as_str(),
|
||||
)?;
|
||||
|
||||
let futures: Vec<_> = payloads
|
||||
.into_iter()
|
||||
.map(|object| {
|
||||
IngestionTask::create_and_add_to_db(object.clone(), user.id.clone(), &state.db)
|
||||
})
|
||||
.collect();
|
||||
|
||||
try_join_all(futures).await?;
|
||||
|
||||
Ok(StatusCode::OK)
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
pub mod categories;
|
||||
pub mod ingress;
|
||||
pub mod ingest;
|
||||
pub mod liveness;
|
||||
pub mod readiness;
|
||||
|
||||
@@ -4,6 +4,9 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "AGPL-3.0-or-later"
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
[dependencies]
|
||||
# Workspace dependencies
|
||||
tokio = { workspace = true }
|
||||
@@ -13,7 +16,7 @@ tracing = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
surrealdb = { workspace = true, features = ["kv-mem"] }
|
||||
surrealdb = { workspace = true }
|
||||
async-openai = { workspace = true }
|
||||
futures = { workspace = true }
|
||||
tempfile = { workspace = true }
|
||||
@@ -41,7 +44,12 @@ surrealdb-migrations = { workspace = true }
|
||||
tokio-retry = { workspace = true }
|
||||
object_store = { workspace = true }
|
||||
bytes = { workspace = true }
|
||||
state-machines = { workspace = true }
|
||||
fastembed = { workspace = true }
|
||||
|
||||
|
||||
[features]
|
||||
test-utils = []
|
||||
test-utils = ["surrealdb/kv-mem"]
|
||||
|
||||
[dev-dependencies]
|
||||
surrealdb = { workspace = true, features = ["kv-mem"] }
|
||||
|
||||
@@ -14,6 +14,9 @@ CREATE system_settings:current CONTENT {
|
||||
query_model: "gpt-4o-mini",
|
||||
processing_model: "gpt-4o-mini",
|
||||
embedding_model: "text-embedding-3-small",
|
||||
voice_processing_model: "whisper-1",
|
||||
image_processing_model: "gpt-4o-mini",
|
||||
image_processing_prompt: "Analyze this image and respond based on its primary content:\n - If the image is mainly text (document, screenshot, sign), transcribe the text verbatim.\n - If the image is mainly visual (photograph, art, landscape), provide a concise description of the scene.\n - For hybrid images (diagrams, ads), briefly describe the visual, then transcribe the text under a Text: heading.\n\n Respond directly with the analysis.",
|
||||
embedding_dimensions: 1536,
|
||||
query_system_prompt: "You are a knowledgeable assistant with access to a specialized knowledge base. You will be provided with relevant knowledge entities from the database as context. Each knowledge entity contains a name, description, and type, representing different concepts, ideas, and information.\nYour task is to:\n1. Carefully analyze the provided knowledge entities in the context\n2. Answer user questions based on this information\n3. Provide clear, concise, and accurate responses\n4. When referencing information, briefly mention which knowledge entity it came from\n5. If the provided context doesn't contain enough information to answer the question confidently, clearly state this\n6. If only partial information is available, explain what you can answer and what information is missing\n7. Avoid making assumptions or providing information not supported by the context\n8. Output the references to the documents. Use the UUIDs and make sure they are correct!\nRemember:\n- Be direct and honest about the limitations of your knowledge\n- Cite the relevant knowledge entities when providing information, but only provide the UUIDs in the reference array\n- If you need to combine information from multiple entities, explain how they connect\n- Don't speculate beyond what's provided in the context\nExample response formats:\n\"Based on [Entity Name], [answer...]\"\n\"I found relevant information in multiple entries: [explanation...]\"\n\"I apologize, but the provided context doesn't contain information about [topic]\"",
|
||||
ingestion_system_prompt: "You are an AI assistant. You will receive a text content, along with user context and a category. Your task is to provide a structured JSON object representing the content in a graph format suitable for a graph database. You will also be presented with some existing knowledge_entities from the database, do not replicate these! Your task is to create meaningful knowledge entities from the submitted content. Try and infer as much as possible from the users context and category when creating these. If the user submits a large content, create more general entities. If the user submits a narrow and precise content, try and create precise knowledge entities.\nThe JSON should have the following structure:\n{\n\"knowledge_entities\": [\n{\n\"key\": \"unique-key-1\",\n\"name\": \"Entity Name\",\n\"description\": \"A detailed description of the entity.\",\n\"entity_type\": \"TypeOfEntity\"\n},\n// More entities...\n],\n\"relationships\": [\n{\n\"type\": \"RelationshipType\",\n\"source\": \"unique-key-1 or UUID from existing database\",\n\"target\": \"unique-key-1 or UUID from existing database\"\n},\n// More relationships...\n]\n}\nGuidelines:\n1. Do NOT generate any IDs or UUIDs. Use a unique `key` for each knowledge entity.\n2. Each KnowledgeEntity should have a unique `key`, a meaningful `name`, and a descriptive `description`.\n3. Define the type of each KnowledgeEntity using the following categories: Idea, Project, Document, Page, TextSnippet.\n4. Establish relationships between entities using types like RelatedTo, RelevantTo, SimilarTo.\n5. Use the `source` key to indicate the originating entity and the `target` key to indicate the related entity\"\n6. You will be presented with a few existing KnowledgeEntities that are similar to the current ones. They will have an existing UUID. When creating relationships to these entities, use their UUID.\n7. Only create relationships between existing KnowledgeEntities.\n8. Entities that exist already in the database should NOT be created again. If there is only a minor overlap, skip creating a new entity.\n9. A new relationship MUST include a newly created KnowledgeEntity."
|
||||
|
||||
@@ -1,27 +1,2 @@
|
||||
DEFINE ANALYZER IF NOT EXISTS app_default_fts_analyzer
|
||||
TOKENIZERS class
|
||||
FILTERS lowercase, ascii;
|
||||
|
||||
DEFINE INDEX IF NOT EXISTS text_content_fts_text_idx ON TABLE text_content
|
||||
FIELDS text
|
||||
SEARCH ANALYZER app_default_fts_analyzer BM25 HIGHLIGHTS;
|
||||
|
||||
DEFINE INDEX IF NOT EXISTS text_content_fts_category_idx ON TABLE text_content
|
||||
FIELDS category
|
||||
SEARCH ANALYZER app_default_fts_analyzer BM25 HIGHLIGHTS;
|
||||
|
||||
DEFINE INDEX IF NOT EXISTS text_content_fts_context_idx ON TABLE text_content
|
||||
FIELDS context
|
||||
SEARCH ANALYZER app_default_fts_analyzer BM25 HIGHLIGHTS;
|
||||
|
||||
DEFINE INDEX IF NOT EXISTS text_content_fts_file_name_idx ON TABLE text_content
|
||||
FIELDS file_info.file_name
|
||||
SEARCH ANALYZER app_default_fts_analyzer BM25 HIGHLIGHTS;
|
||||
|
||||
DEFINE INDEX IF NOT EXISTS text_content_fts_url_idx ON TABLE text_content
|
||||
FIELDS url_info.url
|
||||
SEARCH ANALYZER app_default_fts_analyzer BM25 HIGHLIGHTS;
|
||||
|
||||
DEFINE INDEX IF NOT EXISTS text_content_fts_url_title_idx ON TABLE text_content
|
||||
FIELDS url_info.title
|
||||
SEARCH ANALYZER app_default_fts_analyzer BM25 HIGHLIGHTS;
|
||||
-- Runtime-managed: text_content FTS indexes now created at startup via the shared Surreal helper.
|
||||
-- This migration is intentionally left as a no-op to avoid heavy index builds during migration.
|
||||
|
||||
@@ -1 +1 @@
|
||||
REMOVE TABLE job;
|
||||
-- No-op: legacy `job` table was superseded by `ingestion_task`; kept for migration order compatibility.
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
-- Runtime-managed: FTS indexes now built at startup; migration retained as a no-op.
|
||||
173
common/migrations/20251012_205900_state_machine_migration.surql
Normal file
@@ -0,0 +1,173 @@
|
||||
-- State machine migration for ingestion_task records
|
||||
|
||||
DEFINE FIELD IF NOT EXISTS state ON TABLE ingestion_task TYPE option<string>;
|
||||
DEFINE FIELD IF NOT EXISTS attempts ON TABLE ingestion_task TYPE option<number>;
|
||||
DEFINE FIELD IF NOT EXISTS max_attempts ON TABLE ingestion_task TYPE option<number>;
|
||||
DEFINE FIELD IF NOT EXISTS scheduled_at ON TABLE ingestion_task TYPE option<datetime>;
|
||||
DEFINE FIELD IF NOT EXISTS locked_at ON TABLE ingestion_task TYPE option<datetime>;
|
||||
DEFINE FIELD IF NOT EXISTS lease_duration_secs ON TABLE ingestion_task TYPE option<number>;
|
||||
DEFINE FIELD IF NOT EXISTS worker_id ON TABLE ingestion_task TYPE option<string>;
|
||||
DEFINE FIELD IF NOT EXISTS error_code ON TABLE ingestion_task TYPE option<string>;
|
||||
DEFINE FIELD IF NOT EXISTS error_message ON TABLE ingestion_task TYPE option<string>;
|
||||
DEFINE FIELD IF NOT EXISTS last_error_at ON TABLE ingestion_task TYPE option<datetime>;
|
||||
DEFINE FIELD IF NOT EXISTS priority ON TABLE ingestion_task TYPE option<number>;
|
||||
|
||||
REMOVE FIELD status ON TABLE ingestion_task;
|
||||
DEFINE FIELD status ON TABLE ingestion_task TYPE option<object>;
|
||||
|
||||
DEFINE INDEX IF NOT EXISTS idx_ingestion_task_state_sched ON TABLE ingestion_task FIELDS state, scheduled_at;
|
||||
|
||||
LET $needs_migration = (SELECT count() AS count FROM type::table('ingestion_task') WHERE state = NONE)[0].count;
|
||||
|
||||
IF $needs_migration > 0 THEN {
|
||||
-- Created -> Pending
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET
|
||||
state = "Pending",
|
||||
attempts = 0,
|
||||
max_attempts = 3,
|
||||
scheduled_at = IF created_at != NONE THEN created_at ELSE time::now() END,
|
||||
locked_at = NONE,
|
||||
lease_duration_secs = 300,
|
||||
worker_id = NONE,
|
||||
error_code = NONE,
|
||||
error_message = NONE,
|
||||
last_error_at = NONE,
|
||||
priority = 0
|
||||
WHERE state = NONE
|
||||
AND status != NONE
|
||||
AND status.name = "Created";
|
||||
|
||||
-- InProgress -> Processing
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET
|
||||
state = "Processing",
|
||||
attempts = IF status.attempts != NONE THEN status.attempts ELSE 1 END,
|
||||
max_attempts = 3,
|
||||
scheduled_at = IF status.last_attempt != NONE THEN status.last_attempt ELSE time::now() END,
|
||||
locked_at = IF status.last_attempt != NONE THEN status.last_attempt ELSE time::now() END,
|
||||
lease_duration_secs = 300,
|
||||
worker_id = NONE,
|
||||
error_code = NONE,
|
||||
error_message = NONE,
|
||||
last_error_at = NONE,
|
||||
priority = 0
|
||||
WHERE state = NONE
|
||||
AND status != NONE
|
||||
AND status.name = "InProgress";
|
||||
|
||||
-- Completed -> Succeeded
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET
|
||||
state = "Succeeded",
|
||||
attempts = 1,
|
||||
max_attempts = 3,
|
||||
scheduled_at = IF updated_at != NONE THEN updated_at ELSE time::now() END,
|
||||
locked_at = NONE,
|
||||
lease_duration_secs = 300,
|
||||
worker_id = NONE,
|
||||
error_code = NONE,
|
||||
error_message = NONE,
|
||||
last_error_at = NONE,
|
||||
priority = 0
|
||||
WHERE state = NONE
|
||||
AND status != NONE
|
||||
AND status.name = "Completed";
|
||||
|
||||
-- Error -> DeadLetter (terminal failure)
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET
|
||||
state = "DeadLetter",
|
||||
attempts = 3,
|
||||
max_attempts = 3,
|
||||
scheduled_at = IF updated_at != NONE THEN updated_at ELSE time::now() END,
|
||||
locked_at = NONE,
|
||||
lease_duration_secs = 300,
|
||||
worker_id = NONE,
|
||||
error_code = NONE,
|
||||
error_message = status.message,
|
||||
last_error_at = IF updated_at != NONE THEN updated_at ELSE time::now() END,
|
||||
priority = 0
|
||||
WHERE state = NONE
|
||||
AND status != NONE
|
||||
AND status.name = "Error";
|
||||
|
||||
-- Cancelled -> Cancelled
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET
|
||||
state = "Cancelled",
|
||||
attempts = 0,
|
||||
max_attempts = 3,
|
||||
scheduled_at = IF updated_at != NONE THEN updated_at ELSE time::now() END,
|
||||
locked_at = NONE,
|
||||
lease_duration_secs = 300,
|
||||
worker_id = NONE,
|
||||
error_code = NONE,
|
||||
error_message = NONE,
|
||||
last_error_at = NONE,
|
||||
priority = 0
|
||||
WHERE state = NONE
|
||||
AND status != NONE
|
||||
AND status.name = "Cancelled";
|
||||
|
||||
-- Fallback for any remaining records missing state
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET
|
||||
state = "Pending",
|
||||
attempts = 0,
|
||||
max_attempts = 3,
|
||||
scheduled_at = IF updated_at != NONE THEN updated_at ELSE time::now() END,
|
||||
locked_at = NONE,
|
||||
lease_duration_secs = 300,
|
||||
worker_id = NONE,
|
||||
error_code = NONE,
|
||||
error_message = NONE,
|
||||
last_error_at = NONE,
|
||||
priority = 0
|
||||
WHERE state = NONE;
|
||||
} END;
|
||||
|
||||
-- Ensure defaults for newly added fields
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET max_attempts = 3
|
||||
WHERE max_attempts = NONE;
|
||||
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET lease_duration_secs = 300
|
||||
WHERE lease_duration_secs = NONE;
|
||||
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET attempts = 0
|
||||
WHERE attempts = NONE;
|
||||
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET priority = 0
|
||||
WHERE priority = NONE;
|
||||
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET scheduled_at = IF updated_at != NONE THEN updated_at ELSE time::now() END
|
||||
WHERE scheduled_at = NONE;
|
||||
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET locked_at = NONE
|
||||
WHERE locked_at = NONE;
|
||||
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET worker_id = NONE
|
||||
WHERE worker_id != NONE AND worker_id = "";
|
||||
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET error_code = NONE
|
||||
WHERE error_code = NONE;
|
||||
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET error_message = NONE
|
||||
WHERE error_message = NONE;
|
||||
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET last_error_at = NONE
|
||||
WHERE last_error_at = NONE;
|
||||
|
||||
UPDATE type::table('ingestion_task')
|
||||
SET status = NONE
|
||||
WHERE status != NONE;
|
||||
24
common/migrations/20251022_120302_add_scratchpad_table.surql
Normal file
@@ -0,0 +1,24 @@
|
||||
-- Add scratchpad table and schema
|
||||
|
||||
-- Define scratchpad table and schema
|
||||
DEFINE TABLE IF NOT EXISTS scratchpad SCHEMALESS;
|
||||
|
||||
-- Standard fields from stored_object! macro
|
||||
DEFINE FIELD IF NOT EXISTS created_at ON scratchpad TYPE datetime;
|
||||
DEFINE FIELD IF NOT EXISTS updated_at ON scratchpad TYPE datetime;
|
||||
|
||||
-- Custom fields from the Scratchpad struct
|
||||
DEFINE FIELD IF NOT EXISTS user_id ON scratchpad TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS title ON scratchpad TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS content ON scratchpad TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS last_saved_at ON scratchpad TYPE datetime;
|
||||
DEFINE FIELD IF NOT EXISTS is_dirty ON scratchpad TYPE bool DEFAULT false;
|
||||
DEFINE FIELD IF NOT EXISTS is_archived ON scratchpad TYPE bool DEFAULT false;
|
||||
DEFINE FIELD IF NOT EXISTS archived_at ON scratchpad TYPE option<datetime>;
|
||||
DEFINE FIELD IF NOT EXISTS ingested_at ON scratchpad TYPE option<datetime>;
|
||||
|
||||
-- Indexes based on query patterns
|
||||
DEFINE INDEX IF NOT EXISTS scratchpad_user_idx ON scratchpad FIELDS user_id;
|
||||
DEFINE INDEX IF NOT EXISTS scratchpad_user_archived_idx ON scratchpad FIELDS user_id, is_archived;
|
||||
DEFINE INDEX IF NOT EXISTS scratchpad_updated_idx ON scratchpad FIELDS updated_at;
|
||||
DEFINE INDEX IF NOT EXISTS scratchpad_archived_idx ON scratchpad FIELDS archived_at;
|
||||
@@ -0,0 +1,18 @@
|
||||
-- Remove HNSW indexes from base tables (now created at runtime on *_embedding tables)
|
||||
REMOVE INDEX IF EXISTS idx_embedding_entities ON knowledge_entity;
|
||||
REMOVE INDEX IF EXISTS idx_embedding_chunks ON text_chunk;
|
||||
|
||||
-- Remove FTS indexes (now created at runtime via indexes.rs)
|
||||
REMOVE INDEX IF EXISTS text_content_fts_text_idx ON text_content;
|
||||
REMOVE INDEX IF EXISTS text_content_fts_category_idx ON text_content;
|
||||
REMOVE INDEX IF EXISTS text_content_fts_context_idx ON text_content;
|
||||
REMOVE INDEX IF EXISTS text_content_fts_file_name_idx ON text_content;
|
||||
REMOVE INDEX IF EXISTS text_content_fts_url_idx ON text_content;
|
||||
REMOVE INDEX IF EXISTS text_content_fts_url_title_idx ON text_content;
|
||||
REMOVE INDEX IF EXISTS knowledge_entity_fts_name_idx ON knowledge_entity;
|
||||
REMOVE INDEX IF EXISTS knowledge_entity_fts_description_idx ON knowledge_entity;
|
||||
REMOVE INDEX IF EXISTS text_chunk_fts_chunk_idx ON text_chunk;
|
||||
|
||||
-- Remove legacy analyzers (recreated at runtime with updated configuration)
|
||||
REMOVE ANALYZER IF EXISTS app_default_fts_analyzer;
|
||||
REMOVE ANALYZER IF EXISTS app_en_fts_analyzer;
|
||||
@@ -0,0 +1,23 @@
|
||||
-- Move chunk/entity embeddings to dedicated tables for index efficiency.
|
||||
|
||||
-- Text chunk embeddings table
|
||||
DEFINE TABLE IF NOT EXISTS text_chunk_embedding SCHEMAFULL;
|
||||
DEFINE FIELD IF NOT EXISTS created_at ON text_chunk_embedding TYPE datetime;
|
||||
DEFINE FIELD IF NOT EXISTS updated_at ON text_chunk_embedding TYPE datetime;
|
||||
DEFINE FIELD IF NOT EXISTS user_id ON text_chunk_embedding TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS source_id ON text_chunk_embedding TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS chunk_id ON text_chunk_embedding TYPE record<text_chunk>;
|
||||
DEFINE FIELD IF NOT EXISTS embedding ON text_chunk_embedding TYPE array<float>;
|
||||
DEFINE INDEX IF NOT EXISTS text_chunk_embedding_chunk_id_idx ON text_chunk_embedding FIELDS chunk_id;
|
||||
DEFINE INDEX IF NOT EXISTS text_chunk_embedding_user_id_idx ON text_chunk_embedding FIELDS user_id;
|
||||
DEFINE INDEX IF NOT EXISTS text_chunk_embedding_source_id_idx ON text_chunk_embedding FIELDS source_id;
|
||||
|
||||
-- Knowledge entity embeddings table
|
||||
DEFINE TABLE IF NOT EXISTS knowledge_entity_embedding SCHEMAFULL;
|
||||
DEFINE FIELD IF NOT EXISTS created_at ON knowledge_entity_embedding TYPE datetime;
|
||||
DEFINE FIELD IF NOT EXISTS updated_at ON knowledge_entity_embedding TYPE datetime;
|
||||
DEFINE FIELD IF NOT EXISTS user_id ON knowledge_entity_embedding TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS entity_id ON knowledge_entity_embedding TYPE record<knowledge_entity>;
|
||||
DEFINE FIELD IF NOT EXISTS embedding ON knowledge_entity_embedding TYPE array<float>;
|
||||
DEFINE INDEX IF NOT EXISTS knowledge_entity_embedding_entity_id_idx ON knowledge_entity_embedding FIELDS entity_id;
|
||||
DEFINE INDEX IF NOT EXISTS knowledge_entity_embedding_user_id_idx ON knowledge_entity_embedding FIELDS user_id;
|
||||
@@ -0,0 +1,23 @@
|
||||
-- Copy embeddings from base tables to dedicated tables
|
||||
-- This runs BEFORE the field removal migration
|
||||
|
||||
FOR $chunk IN (SELECT * FROM text_chunk WHERE embedding != NONE AND array::len(embedding) > 0) {
|
||||
CREATE text_chunk_embedding CONTENT {
|
||||
chunk_id: $chunk.id,
|
||||
embedding: $chunk.embedding,
|
||||
user_id: $chunk.user_id,
|
||||
source_id: $chunk.source_id,
|
||||
created_at: $chunk.created_at,
|
||||
updated_at: $chunk.updated_at
|
||||
};
|
||||
};
|
||||
|
||||
FOR $entity IN (SELECT * FROM knowledge_entity WHERE embedding != NONE AND array::len(embedding) > 0) {
|
||||
CREATE knowledge_entity_embedding CONTENT {
|
||||
entity_id: $entity.id,
|
||||
embedding: $entity.embedding,
|
||||
user_id: $entity.user_id,
|
||||
created_at: $entity.created_at,
|
||||
updated_at: $entity.updated_at
|
||||
};
|
||||
};
|
||||
@@ -0,0 +1,3 @@
|
||||
-- Drop legacy embedding fields from base tables; embeddings now live in *_embedding tables.
|
||||
REMOVE FIELD IF EXISTS embedding ON TABLE text_chunk;
|
||||
REMOVE FIELD IF EXISTS embedding ON TABLE knowledge_entity;
|
||||
@@ -0,0 +1,8 @@
|
||||
-- Add embedding_backend field to system_settings for visibility of active backend
|
||||
|
||||
DEFINE FIELD IF NOT EXISTS embedding_backend ON system_settings TYPE option<string>;
|
||||
|
||||
-- Set default to 'openai' for existing installs to preserve backward compatibility
|
||||
UPDATE system_settings:current SET
|
||||
embedding_backend = 'openai'
|
||||
WHERE embedding_backend == NONE;
|
||||
97
common/migrations/20251231_enforce_schemafull.surql
Normal file
@@ -0,0 +1,97 @@
|
||||
-- Enforce SCHEMAFULL on all tables and define missing fields
|
||||
|
||||
-- 1. Define missing fields for ingestion_task (formerly `job`, now `ingestion_task`)
|
||||
DEFINE TABLE OVERWRITE ingestion_task SCHEMAFULL;
|
||||
|
||||
-- Core Fields
|
||||
DEFINE FIELD IF NOT EXISTS id ON ingestion_task TYPE record<ingestion_task>;
|
||||
DEFINE FIELD IF NOT EXISTS created_at ON ingestion_task TYPE datetime DEFAULT time::now();
|
||||
DEFINE FIELD IF NOT EXISTS updated_at ON ingestion_task TYPE datetime DEFAULT time::now();
|
||||
DEFINE FIELD IF NOT EXISTS user_id ON ingestion_task TYPE string;
|
||||
|
||||
-- State Machine Fields
|
||||
DEFINE FIELD IF NOT EXISTS state ON ingestion_task TYPE string ASSERT $value IN ['Pending', 'Reserved', 'Processing', 'Succeeded', 'Failed', 'Cancelled', 'DeadLetter'];
|
||||
DEFINE FIELD IF NOT EXISTS attempts ON ingestion_task TYPE int DEFAULT 0;
|
||||
DEFINE FIELD IF NOT EXISTS max_attempts ON ingestion_task TYPE int DEFAULT 3;
|
||||
DEFINE FIELD IF NOT EXISTS scheduled_at ON ingestion_task TYPE datetime DEFAULT time::now();
|
||||
DEFINE FIELD IF NOT EXISTS locked_at ON ingestion_task TYPE option<datetime>;
|
||||
DEFINE FIELD IF NOT EXISTS lease_duration_secs ON ingestion_task TYPE int DEFAULT 300;
|
||||
DEFINE FIELD IF NOT EXISTS worker_id ON ingestion_task TYPE option<string>;
|
||||
DEFINE FIELD IF NOT EXISTS error_code ON ingestion_task TYPE option<string>;
|
||||
DEFINE FIELD IF NOT EXISTS error_message ON ingestion_task TYPE option<string>;
|
||||
DEFINE FIELD IF NOT EXISTS last_error_at ON ingestion_task TYPE option<datetime>;
|
||||
DEFINE FIELD IF NOT EXISTS priority ON ingestion_task TYPE int DEFAULT 0;
|
||||
|
||||
-- Content Payload (IngestionPayload Enum)
|
||||
DEFINE FIELD IF NOT EXISTS content ON ingestion_task TYPE object;
|
||||
DEFINE FIELD IF NOT EXISTS content.Url ON ingestion_task TYPE option<object>;
|
||||
DEFINE FIELD IF NOT EXISTS content.Text ON ingestion_task TYPE option<object>;
|
||||
DEFINE FIELD IF NOT EXISTS content.File ON ingestion_task TYPE option<object>;
|
||||
|
||||
-- Content: Url Variant
|
||||
DEFINE FIELD IF NOT EXISTS content.Url.url ON ingestion_task TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS content.Url.context ON ingestion_task TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS content.Url.category ON ingestion_task TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS content.Url.user_id ON ingestion_task TYPE string;
|
||||
|
||||
-- Content: Text Variant
|
||||
DEFINE FIELD IF NOT EXISTS content.Text.text ON ingestion_task TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS content.Text.context ON ingestion_task TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS content.Text.category ON ingestion_task TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS content.Text.user_id ON ingestion_task TYPE string;
|
||||
|
||||
-- Content: File Variant
|
||||
DEFINE FIELD IF NOT EXISTS content.File.context ON ingestion_task TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS content.File.category ON ingestion_task TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS content.File.user_id ON ingestion_task TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS content.File.file_info ON ingestion_task TYPE object;
|
||||
|
||||
-- Content: File.file_info (FileInfo Struct)
|
||||
DEFINE FIELD IF NOT EXISTS content.File.file_info.id ON ingestion_task TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS content.File.file_info.created_at ON ingestion_task TYPE datetime;
|
||||
DEFINE FIELD IF NOT EXISTS content.File.file_info.updated_at ON ingestion_task TYPE datetime;
|
||||
DEFINE FIELD IF NOT EXISTS content.File.file_info.sha256 ON ingestion_task TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS content.File.file_info.path ON ingestion_task TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS content.File.file_info.file_name ON ingestion_task TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS content.File.file_info.mime_type ON ingestion_task TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS content.File.file_info.user_id ON ingestion_task TYPE string;
|
||||
|
||||
-- 2. Enforce SCHEMAFULL on all other tables
|
||||
DEFINE TABLE OVERWRITE analytics SCHEMAFULL;
|
||||
DEFINE TABLE OVERWRITE conversation SCHEMAFULL;
|
||||
DEFINE TABLE OVERWRITE file SCHEMAFULL;
|
||||
DEFINE TABLE OVERWRITE knowledge_entity SCHEMAFULL;
|
||||
DEFINE TABLE OVERWRITE message SCHEMAFULL;
|
||||
DEFINE TABLE OVERWRITE relates_to SCHEMAFULL TYPE RELATION;
|
||||
DEFINE FIELD IF NOT EXISTS in ON relates_to TYPE record<knowledge_entity>;
|
||||
DEFINE FIELD IF NOT EXISTS out ON relates_to TYPE record<knowledge_entity>;
|
||||
DEFINE FIELD IF NOT EXISTS metadata ON relates_to TYPE object;
|
||||
DEFINE FIELD IF NOT EXISTS metadata.user_id ON relates_to TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS metadata.source_id ON relates_to TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS metadata.relationship_type ON relates_to TYPE string;
|
||||
DEFINE TABLE OVERWRITE scratchpad SCHEMAFULL;
|
||||
DEFINE TABLE OVERWRITE system_settings SCHEMAFULL;
|
||||
DEFINE TABLE OVERWRITE text_chunk SCHEMAFULL;
|
||||
-- text_content must have fields defined before enforcing SCHEMAFULL
|
||||
DEFINE TABLE OVERWRITE text_content SCHEMAFULL;
|
||||
DEFINE FIELD IF NOT EXISTS created_at ON text_content TYPE datetime;
|
||||
DEFINE FIELD IF NOT EXISTS updated_at ON text_content TYPE datetime;
|
||||
DEFINE FIELD IF NOT EXISTS text ON text_content TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS file_info ON text_content TYPE option<object>;
|
||||
DEFINE FIELD IF NOT EXISTS url_info ON text_content TYPE option<object>;
|
||||
DEFINE FIELD IF NOT EXISTS url_info.url ON text_content TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS url_info.title ON text_content TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS url_info.image_id ON text_content TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS context ON text_content TYPE option<string>;
|
||||
DEFINE FIELD IF NOT EXISTS category ON text_content TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS user_id ON text_content TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS file_info.id ON text_content TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS file_info.created_at ON text_content TYPE datetime;
|
||||
DEFINE FIELD IF NOT EXISTS file_info.updated_at ON text_content TYPE datetime;
|
||||
DEFINE FIELD IF NOT EXISTS file_info.sha256 ON text_content TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS file_info.path ON text_content TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS file_info.file_name ON text_content TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS file_info.mime_type ON text_content TYPE string;
|
||||
DEFINE FIELD IF NOT EXISTS file_info.user_id ON text_content TYPE string;
|
||||
|
||||
DEFINE TABLE OVERWRITE user SCHEMAFULL;
|
||||
@@ -0,0 +1 @@
|
||||
DEFINE FIELD IF NOT EXISTS theme ON user TYPE string DEFAULT "system";
|
||||
@@ -1 +0,0 @@
|
||||
{"schemas":"--- original\n+++ modified\n@@ -98,7 +98,7 @@\n DEFINE INDEX IF NOT EXISTS knowledge_entity_user_id_idx ON knowledge_entity FIELDS user_id;\n DEFINE INDEX IF NOT EXISTS knowledge_entity_source_id_idx ON knowledge_entity FIELDS source_id;\n DEFINE INDEX IF NOT EXISTS knowledge_entity_entity_type_idx ON knowledge_entity FIELDS entity_type;\n-DEFINE INDEX IF NOT EXISTS knowledge_entity_created_at_idx ON knowledge_entity FIELDS created_at; # For get_latest_knowledge_entities\n+DEFINE INDEX IF NOT EXISTS knowledge_entity_created_at_idx ON knowledge_entity FIELDS created_at;\n\n # Defines the schema for the 'message' table.\n\n@@ -157,6 +157,8 @@\n DEFINE FIELD IF NOT EXISTS require_email_verification ON system_settings TYPE bool;\n DEFINE FIELD IF NOT EXISTS query_model ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS processing_model ON system_settings TYPE string;\n+DEFINE FIELD IF NOT EXISTS embedding_model ON system_settings TYPE string;\n+DEFINE FIELD IF NOT EXISTS embedding_dimensions ON system_settings TYPE int;\n DEFINE FIELD IF NOT EXISTS query_system_prompt ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS ingestion_system_prompt ON system_settings TYPE string;\n\n","events":null}
|
||||
@@ -1 +0,0 @@
|
||||
{"schemas":"--- original\n+++ modified\n@@ -51,23 +51,23 @@\n\n # Defines the schema for the 'ingestion_task' table (used by IngestionTask).\n\n-DEFINE TABLE IF NOT EXISTS job SCHEMALESS;\n+DEFINE TABLE IF NOT EXISTS ingestion_task SCHEMALESS;\n\n # Standard fields\n-DEFINE FIELD IF NOT EXISTS created_at ON job TYPE string;\n-DEFINE FIELD IF NOT EXISTS updated_at ON job TYPE string;\n+DEFINE FIELD IF NOT EXISTS created_at ON ingestion_task TYPE string;\n+DEFINE FIELD IF NOT EXISTS updated_at ON ingestion_task TYPE string;\n\n # Custom fields from the IngestionTask struct\n # IngestionPayload is complex, store as object\n-DEFINE FIELD IF NOT EXISTS content ON job TYPE object;\n+DEFINE FIELD IF NOT EXISTS content ON ingestion_task TYPE object;\n # IngestionTaskStatus can hold data (InProgress), store as object\n-DEFINE FIELD IF NOT EXISTS status ON job TYPE object;\n-DEFINE FIELD IF NOT EXISTS user_id ON job TYPE string;\n+DEFINE FIELD IF NOT EXISTS status ON ingestion_task TYPE object;\n+DEFINE FIELD IF NOT EXISTS user_id ON ingestion_task TYPE string;\n\n # Indexes explicitly defined in build_indexes and useful for get_unfinished_tasks\n-DEFINE INDEX IF NOT EXISTS idx_job_status ON job FIELDS status;\n-DEFINE INDEX IF NOT EXISTS idx_job_user ON job FIELDS user_id;\n-DEFINE INDEX IF NOT EXISTS idx_job_created ON job FIELDS created_at;\n+DEFINE INDEX IF NOT EXISTS idx_ingestion_task_status ON ingestion_task FIELDS status;\n+DEFINE INDEX IF NOT EXISTS idx_ingestion_task_user ON ingestion_task FIELDS user_id;\n+DEFINE INDEX IF NOT EXISTS idx_ingestion_task_created ON ingestion_task FIELDS created_at;\n\n # Defines the schema for the 'knowledge_entity' table.\n\n","events":null}
|
||||
@@ -1 +0,0 @@
|
||||
{"schemas":"--- original\n+++ modified\n@@ -57,10 +57,7 @@\n DEFINE FIELD IF NOT EXISTS created_at ON ingestion_task TYPE string;\n DEFINE FIELD IF NOT EXISTS updated_at ON ingestion_task TYPE string;\n\n-# Custom fields from the IngestionTask struct\n-# IngestionPayload is complex, store as object\n DEFINE FIELD IF NOT EXISTS content ON ingestion_task TYPE object;\n-# IngestionTaskStatus can hold data (InProgress), store as object\n DEFINE FIELD IF NOT EXISTS status ON ingestion_task TYPE object;\n DEFINE FIELD IF NOT EXISTS user_id ON ingestion_task TYPE string;\n\n@@ -157,10 +154,12 @@\n DEFINE FIELD IF NOT EXISTS require_email_verification ON system_settings TYPE bool;\n DEFINE FIELD IF NOT EXISTS query_model ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS processing_model ON system_settings TYPE string;\n+DEFINE FIELD IF NOT EXISTS image_processing_model ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS embedding_model ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS embedding_dimensions ON system_settings TYPE int;\n DEFINE FIELD IF NOT EXISTS query_system_prompt ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS ingestion_system_prompt ON system_settings TYPE string;\n+DEFINE FIELD IF NOT EXISTS image_processing_prompt ON system_settings TYPE string;\n\n # Defines the schema for the 'text_chunk' table.\n\n","events":null}
|
||||
@@ -1 +0,0 @@
|
||||
{"schemas":"--- original\n+++ modified\n@@ -160,6 +160,7 @@\n DEFINE FIELD IF NOT EXISTS query_system_prompt ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS ingestion_system_prompt ON system_settings TYPE string;\n DEFINE FIELD IF NOT EXISTS image_processing_prompt ON system_settings TYPE string;\n+DEFINE FIELD IF NOT EXISTS voice_processing_model ON system_settings TYPE string;\n\n # Defines the schema for the 'text_chunk' table.\n\n","events":null}
|
||||
@@ -1 +0,0 @@
|
||||
{"schemas":"--- original\n+++ modified\n@@ -18,8 +18,8 @@\n DEFINE TABLE IF NOT EXISTS conversation SCHEMALESS;\n\n # Standard fields\n-DEFINE FIELD IF NOT EXISTS created_at ON conversation TYPE string;\n-DEFINE FIELD IF NOT EXISTS updated_at ON conversation TYPE string;\n+DEFINE FIELD IF NOT EXISTS created_at ON conversation TYPE datetime;\n+DEFINE FIELD IF NOT EXISTS updated_at ON conversation TYPE datetime;\n\n # Custom fields from the Conversation struct\n DEFINE FIELD IF NOT EXISTS user_id ON conversation TYPE string;\n@@ -34,8 +34,8 @@\n DEFINE TABLE IF NOT EXISTS file SCHEMALESS;\n\n # Standard fields\n-DEFINE FIELD IF NOT EXISTS created_at ON file TYPE string;\n-DEFINE FIELD IF NOT EXISTS updated_at ON file TYPE string;\n+DEFINE FIELD IF NOT EXISTS created_at ON file TYPE datetime;\n+DEFINE FIELD IF NOT EXISTS updated_at ON file TYPE datetime;\n\n # Custom fields from the FileInfo struct\n DEFINE FIELD IF NOT EXISTS sha256 ON file TYPE string;\n@@ -54,8 +54,8 @@\n DEFINE TABLE IF NOT EXISTS ingestion_task SCHEMALESS;\n\n # Standard fields\n-DEFINE FIELD IF NOT EXISTS created_at ON ingestion_task TYPE string;\n-DEFINE FIELD IF NOT EXISTS updated_at ON ingestion_task TYPE string;\n+DEFINE FIELD IF NOT EXISTS created_at ON ingestion_task TYPE datetime;\n+DEFINE FIELD IF NOT EXISTS updated_at ON ingestion_task TYPE datetime;\n\n DEFINE FIELD IF NOT EXISTS content ON ingestion_task TYPE object;\n DEFINE FIELD IF NOT EXISTS status ON ingestion_task TYPE object;\n@@ -71,8 +71,8 @@\n DEFINE TABLE IF NOT EXISTS knowledge_entity SCHEMALESS;\n\n # Standard fields\n-DEFINE FIELD IF NOT EXISTS created_at ON knowledge_entity TYPE string;\n-DEFINE FIELD IF NOT EXISTS updated_at ON knowledge_entity TYPE string;\n+DEFINE FIELD IF NOT EXISTS created_at ON knowledge_entity TYPE datetime;\n+DEFINE FIELD IF NOT EXISTS updated_at ON knowledge_entity TYPE datetime;\n\n # Custom fields from the KnowledgeEntity struct\n DEFINE FIELD IF NOT EXISTS source_id ON knowledge_entity TYPE string;\n@@ -102,8 +102,8 @@\n DEFINE TABLE IF NOT EXISTS message SCHEMALESS;\n\n # Standard fields\n-DEFINE FIELD IF NOT EXISTS created_at ON message TYPE string;\n-DEFINE FIELD IF NOT EXISTS updated_at ON message TYPE string;\n+DEFINE FIELD IF NOT EXISTS created_at ON message TYPE datetime;\n+DEFINE FIELD IF NOT EXISTS updated_at ON message TYPE datetime;\n\n # Custom fields from the Message struct\n DEFINE FIELD IF NOT EXISTS conversation_id ON message TYPE string;\n@@ -167,8 +167,8 @@\n DEFINE TABLE IF NOT EXISTS text_chunk SCHEMALESS;\n\n # Standard fields\n-DEFINE FIELD IF NOT EXISTS created_at ON text_chunk TYPE string;\n-DEFINE FIELD IF NOT EXISTS updated_at ON text_chunk TYPE string;\n+DEFINE FIELD IF NOT EXISTS created_at ON text_chunk TYPE datetime;\n+DEFINE FIELD IF NOT EXISTS updated_at ON text_chunk TYPE datetime;\n\n # Custom fields from the TextChunk struct\n DEFINE FIELD IF NOT EXISTS source_id ON text_chunk TYPE string;\n@@ -191,8 +191,8 @@\n DEFINE TABLE IF NOT EXISTS text_content SCHEMALESS;\n\n # Standard fields\n-DEFINE FIELD IF NOT EXISTS created_at ON text_content TYPE string;\n-DEFINE FIELD IF NOT EXISTS updated_at ON text_content TYPE string;\n+DEFINE FIELD IF NOT EXISTS created_at ON text_content TYPE datetime;\n+DEFINE FIELD IF NOT EXISTS updated_at ON text_content TYPE datetime;\n\n # Custom fields from the TextContent struct\n DEFINE FIELD IF NOT EXISTS text ON text_content TYPE string;\n@@ -215,8 +215,8 @@\n DEFINE TABLE IF NOT EXISTS user SCHEMALESS;\n\n # Standard fields\n-DEFINE FIELD IF NOT 
EXISTS created_at ON user TYPE string;\n-DEFINE FIELD IF NOT EXISTS updated_at ON user TYPE string;\n+DEFINE FIELD IF NOT EXISTS created_at ON user TYPE datetime;\n+DEFINE FIELD IF NOT EXISTS updated_at ON user TYPE datetime;\n\n # Custom fields from the User struct\n DEFINE FIELD IF NOT EXISTS email ON user TYPE string;\n","events":null}
|
||||
File diff suppressed because one or more lines are too long
@@ -0,0 +1 @@
|
||||
{"schemas":"--- original\n+++ modified\n@@ -242,7 +242,7 @@\n\n # Defines the schema for the 'text_content' table.\n\n-DEFINE TABLE IF NOT EXISTS text_content SCHEMALESS;\n+DEFINE TABLE IF NOT EXISTS text_content SCHEMAFULL;\n\n # Standard fields\n DEFINE FIELD IF NOT EXISTS created_at ON text_content TYPE datetime;\n@@ -254,10 +254,24 @@\n DEFINE FIELD IF NOT EXISTS file_info ON text_content TYPE option<object>;\n # UrlInfo is a struct, store as object\n DEFINE FIELD IF NOT EXISTS url_info ON text_content TYPE option<object>;\n+DEFINE FIELD IF NOT EXISTS url_info.url ON text_content TYPE string;\n+DEFINE FIELD IF NOT EXISTS url_info.title ON text_content TYPE string;\n+DEFINE FIELD IF NOT EXISTS url_info.image_id ON text_content TYPE string;\n+\n DEFINE FIELD IF NOT EXISTS context ON text_content TYPE option<string>;\n DEFINE FIELD IF NOT EXISTS category ON text_content TYPE string;\n DEFINE FIELD IF NOT EXISTS user_id ON text_content TYPE string;\n\n+# FileInfo fields\n+DEFINE FIELD IF NOT EXISTS file_info.id ON text_content TYPE string;\n+DEFINE FIELD IF NOT EXISTS file_info.created_at ON text_content TYPE datetime;\n+DEFINE FIELD IF NOT EXISTS file_info.updated_at ON text_content TYPE datetime;\n+DEFINE FIELD IF NOT EXISTS file_info.sha256 ON text_content TYPE string;\n+DEFINE FIELD IF NOT EXISTS file_info.path ON text_content TYPE string;\n+DEFINE FIELD IF NOT EXISTS file_info.file_name ON text_content TYPE string;\n+DEFINE FIELD IF NOT EXISTS file_info.mime_type ON text_content TYPE string;\n+DEFINE FIELD IF NOT EXISTS file_info.user_id ON text_content TYPE string;\n+\n # Indexes based on query patterns\n DEFINE INDEX IF NOT EXISTS text_content_user_id_idx ON text_content FIELDS user_id;\n DEFINE INDEX IF NOT EXISTS text_content_created_at_idx ON text_content FIELDS created_at;\n","events":null}
|
||||
@@ -0,0 +1 @@
|
||||
{"schemas":"--- original\n+++ modified\n@@ -28,6 +28,7 @@\n # Add indexes based on query patterns (get_complete_conversation ownership check, get_user_conversations)\n DEFINE INDEX IF NOT EXISTS conversation_user_id_idx ON conversation FIELDS user_id;\n DEFINE INDEX IF NOT EXISTS conversation_created_at_idx ON conversation FIELDS created_at; # For get_user_conversations ORDER BY\n+DEFINE INDEX IF NOT EXISTS conversation_user_updated_at_idx ON conversation FIELDS user_id, updated_at; # For sidebar conversation projection ORDER BY\n\n # Defines the schema for the 'file' table (used by FileInfo).\n\n","events":null}
|
||||
File diff suppressed because one or more lines are too long
@@ -13,3 +13,4 @@ DEFINE FIELD IF NOT EXISTS title ON conversation TYPE string;
|
||||
# Add indexes based on query patterns (get_complete_conversation ownership check, get_user_conversations)
|
||||
DEFINE INDEX IF NOT EXISTS conversation_user_id_idx ON conversation FIELDS user_id;
|
||||
DEFINE INDEX IF NOT EXISTS conversation_created_at_idx ON conversation FIELDS created_at; # For get_user_conversations ORDER BY
|
||||
DEFINE INDEX IF NOT EXISTS conversation_user_updated_at_idx ON conversation FIELDS user_id, updated_at; # For sidebar conversation projection ORDER BY
|
||||
|
||||
@@ -15,16 +15,12 @@ DEFINE FIELD IF NOT EXISTS entity_type ON knowledge_entity TYPE string;
# metadata is Option<serde_json::Value>, store as object
DEFINE FIELD IF NOT EXISTS metadata ON knowledge_entity TYPE option<object>;

# Define embedding as a standard array of floats for schema definition
DEFINE FIELD IF NOT EXISTS embedding ON knowledge_entity TYPE array<float>;
# The specific vector nature is handled by the index definition below

DEFINE FIELD IF NOT EXISTS user_id ON knowledge_entity TYPE string;

# Indexes based on build_indexes and query patterns
# The INDEX definition correctly specifies the vector properties
DEFINE INDEX IF NOT EXISTS idx_embedding_entities ON knowledge_entity FIELDS embedding HNSW DIMENSION 1536;
DEFINE INDEX IF NOT EXISTS knowledge_entity_user_id_idx ON knowledge_entity FIELDS user_id;
-- Indexes based on build_indexes and query patterns
-- HNSW index now defined on knowledge_entity_embedding table for better memory usage
-- DEFINE INDEX IF NOT EXISTS idx_embedding_entities ON knowledge_entity FIELDS embedding HNSW DIMENSION 1536;
DEFINE INDEX IF NOT EXISTS knowledge_entity_source_id_idx ON knowledge_entity FIELDS source_id;
DEFINE INDEX IF NOT EXISTS knowledge_entity_user_id_idx ON knowledge_entity FIELDS user_id;
DEFINE INDEX IF NOT EXISTS knowledge_entity_entity_type_idx ON knowledge_entity FIELDS entity_type;
DEFINE INDEX IF NOT EXISTS knowledge_entity_created_at_idx ON knowledge_entity FIELDS created_at;

common/schemas/knowledge_entity_embedding.surql (Normal file, 18 lines)
@@ -0,0 +1,18 @@
-- Defines the schema for the 'knowledge_entity_embedding' table.
-- Separate table to optimize HNSW index creation memory usage

DEFINE TABLE IF NOT EXISTS knowledge_entity_embedding SCHEMAFULL;

-- Standard fields
DEFINE FIELD IF NOT EXISTS created_at ON knowledge_entity_embedding TYPE datetime;
DEFINE FIELD IF NOT EXISTS updated_at ON knowledge_entity_embedding TYPE datetime;
DEFINE FIELD IF NOT EXISTS user_id ON knowledge_entity_embedding TYPE string;

-- Custom fields
DEFINE FIELD IF NOT EXISTS entity_id ON knowledge_entity_embedding TYPE record<knowledge_entity>;
DEFINE FIELD IF NOT EXISTS embedding ON knowledge_entity_embedding TYPE array<float>;

-- Indexes
-- DEFINE INDEX IF NOT EXISTS idx_embedding_knowledge_entity_embedding ON knowledge_entity_embedding FIELDS embedding HNSW DIMENSION 1536;
DEFINE INDEX IF NOT EXISTS knowledge_entity_embedding_entity_id_idx ON knowledge_entity_embedding FIELDS entity_id;
DEFINE INDEX IF NOT EXISTS knowledge_entity_embedding_user_id_idx ON knowledge_entity_embedding FIELDS user_id;
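The new knowledge_entity_embedding table keeps the vectors out of the main knowledge_entity rows, so the HNSW index only ever touches this narrow table. A minimal sketch of how a KNN lookup against it might be issued from Rust follows; the field names come from the schema above, but the query shape and the `<|10|>` KNN operator syntax are assumptions about SurrealDB usage, not code from this change:

// Sketch only: assumes the SurrealDbClient from common/src/storage/db.rs and a
// query vector with the same dimension as the runtime HNSW index.
async fn similar_entity_ids(
    db: &SurrealDbClient,
    user_id: &str,
    query_vec: Vec<f32>,
) -> Result<Vec<serde_json::Value>, surrealdb::Error> {
    let mut response = db
        .client
        .query(
            // `<|10|>` is SurrealDB's KNN operator; the exact syntax depends on the
            // SurrealDB version, so treat this query as illustrative.
            "SELECT entity_id FROM knowledge_entity_embedding \
             WHERE user_id = $user_id AND embedding <|10|> $vec",
        )
        .bind(("user_id", user_id.to_string()))
        .bind(("vec", query_vec))
        .await?;
    // entity_id is a record link (record<knowledge_entity>), so the rows are left
    // as loose JSON values here rather than a concrete struct.
    response.take(0)
}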
common/schemas/scratchpad.surql (Normal file, 23 lines)
@@ -0,0 +1,23 @@
# Defines the schema for the 'scratchpad' table.

DEFINE TABLE IF NOT EXISTS scratchpad SCHEMALESS;

# Standard fields from stored_object! macro
DEFINE FIELD IF NOT EXISTS created_at ON scratchpad TYPE datetime;
DEFINE FIELD IF NOT EXISTS updated_at ON scratchpad TYPE datetime;

# Custom fields from the Scratchpad struct
DEFINE FIELD IF NOT EXISTS user_id ON scratchpad TYPE string;
DEFINE FIELD IF NOT EXISTS title ON scratchpad TYPE string;
DEFINE FIELD IF NOT EXISTS content ON scratchpad TYPE string;
DEFINE FIELD IF NOT EXISTS last_saved_at ON scratchpad TYPE datetime;
DEFINE FIELD IF NOT EXISTS is_dirty ON scratchpad TYPE bool DEFAULT false;
DEFINE FIELD IF NOT EXISTS is_archived ON scratchpad TYPE bool DEFAULT false;
DEFINE FIELD IF NOT EXISTS archived_at ON scratchpad TYPE option<datetime>;
DEFINE FIELD IF NOT EXISTS ingested_at ON scratchpad TYPE option<datetime>;

# Indexes based on query patterns
DEFINE INDEX IF NOT EXISTS scratchpad_user_idx ON scratchpad FIELDS user_id;
DEFINE INDEX IF NOT EXISTS scratchpad_user_archived_idx ON scratchpad FIELDS user_id, is_archived;
DEFINE INDEX IF NOT EXISTS scratchpad_updated_idx ON scratchpad FIELDS updated_at;
DEFINE INDEX IF NOT EXISTS scratchpad_archived_idx ON scratchpad FIELDS archived_at;
@@ -10,14 +10,8 @@ DEFINE FIELD IF NOT EXISTS updated_at ON text_chunk TYPE datetime;
DEFINE FIELD IF NOT EXISTS source_id ON text_chunk TYPE string;
DEFINE FIELD IF NOT EXISTS chunk ON text_chunk TYPE string;

# Define embedding as a standard array of floats for schema definition
DEFINE FIELD IF NOT EXISTS embedding ON text_chunk TYPE array<float>;
# The specific vector nature is handled by the index definition below

DEFINE FIELD IF NOT EXISTS user_id ON text_chunk TYPE string;

# Indexes based on build_indexes and query patterns (delete_by_source_id)
# The INDEX definition correctly specifies the vector properties
DEFINE INDEX IF NOT EXISTS idx_embedding_chunks ON text_chunk FIELDS embedding HNSW DIMENSION 1536;
DEFINE INDEX IF NOT EXISTS text_chunk_source_id_idx ON text_chunk FIELDS source_id;
DEFINE INDEX IF NOT EXISTS text_chunk_user_id_idx ON text_chunk FIELDS user_id;

common/schemas/text_chunk_embedding.surql (Normal file, 20 lines)
@@ -0,0 +1,20 @@
-- Defines the schema for the 'text_chunk_embedding' table.
-- Separate table to optimize HNSW index creation memory usage

DEFINE TABLE IF NOT EXISTS text_chunk_embedding SCHEMAFULL;

# Standard fields
DEFINE FIELD IF NOT EXISTS created_at ON text_chunk_embedding TYPE datetime;
DEFINE FIELD IF NOT EXISTS updated_at ON text_chunk_embedding TYPE datetime;
DEFINE FIELD IF NOT EXISTS user_id ON text_chunk_embedding TYPE string;
DEFINE FIELD IF NOT EXISTS source_id ON text_chunk_embedding TYPE string;

# Custom fields
DEFINE FIELD IF NOT EXISTS chunk_id ON text_chunk_embedding TYPE record<text_chunk>;
DEFINE FIELD IF NOT EXISTS embedding ON text_chunk_embedding TYPE array<float>;

-- Indexes
-- DEFINE INDEX IF NOT EXISTS idx_embedding_text_chunk_embedding ON text_chunk_embedding FIELDS embedding HNSW DIMENSION 1536;
DEFINE INDEX IF NOT EXISTS text_chunk_embedding_chunk_id_idx ON text_chunk_embedding FIELDS chunk_id;
DEFINE INDEX IF NOT EXISTS text_chunk_embedding_user_id_idx ON text_chunk_embedding FIELDS user_id;
DEFINE INDEX IF NOT EXISTS text_chunk_embedding_source_id_idx ON text_chunk_embedding FIELDS source_id;
@@ -1,6 +1,6 @@
# Defines the schema for the 'text_content' table.

DEFINE TABLE IF NOT EXISTS text_content SCHEMALESS;
DEFINE TABLE IF NOT EXISTS text_content SCHEMAFULL;

# Standard fields
DEFINE FIELD IF NOT EXISTS created_at ON text_content TYPE datetime;
@@ -12,10 +12,24 @@ DEFINE FIELD IF NOT EXISTS text ON text_content TYPE string;
DEFINE FIELD IF NOT EXISTS file_info ON text_content TYPE option<object>;
# UrlInfo is a struct, store as object
DEFINE FIELD IF NOT EXISTS url_info ON text_content TYPE option<object>;
DEFINE FIELD IF NOT EXISTS url_info.url ON text_content TYPE string;
DEFINE FIELD IF NOT EXISTS url_info.title ON text_content TYPE string;
DEFINE FIELD IF NOT EXISTS url_info.image_id ON text_content TYPE string;

DEFINE FIELD IF NOT EXISTS context ON text_content TYPE option<string>;
DEFINE FIELD IF NOT EXISTS category ON text_content TYPE string;
DEFINE FIELD IF NOT EXISTS user_id ON text_content TYPE string;

# FileInfo fields
DEFINE FIELD IF NOT EXISTS file_info.id ON text_content TYPE string;
DEFINE FIELD IF NOT EXISTS file_info.created_at ON text_content TYPE datetime;
DEFINE FIELD IF NOT EXISTS file_info.updated_at ON text_content TYPE datetime;
DEFINE FIELD IF NOT EXISTS file_info.sha256 ON text_content TYPE string;
DEFINE FIELD IF NOT EXISTS file_info.path ON text_content TYPE string;
DEFINE FIELD IF NOT EXISTS file_info.file_name ON text_content TYPE string;
DEFINE FIELD IF NOT EXISTS file_info.mime_type ON text_content TYPE string;
DEFINE FIELD IF NOT EXISTS file_info.user_id ON text_content TYPE string;

# Indexes based on query patterns
DEFINE INDEX IF NOT EXISTS text_content_user_id_idx ON text_content FIELDS user_id;
DEFINE INDEX IF NOT EXISTS text_content_created_at_idx ON text_content FIELDS created_at;

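For reference, the nested url_info.* and file_info.* definitions above describe object shapes roughly like the following; the real UrlInfo and FileInfo structs live in common/src/storage/types and may carry more detail, so treat this as an illustration of the schema only:

// Illustration of the object shape the SCHEMAFULL definitions above expect.
// Field names come from the schema; the actual structs in the codebase may differ.
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};

#[derive(Debug, Serialize, Deserialize)]
struct UrlInfoShape {
    url: String,
    title: String,
    image_id: String,
}

#[derive(Debug, Serialize, Deserialize)]
struct FileInfoShape {
    id: String,
    created_at: DateTime<Utc>,
    updated_at: DateTime<Utc>,
    sha256: String,
    path: String,
    file_name: String,
    mime_type: String,
    user_id: String,
}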
@@ -5,6 +5,7 @@ use tokio::task::JoinError;
use crate::storage::types::file_info::FileError;

// Core internal errors
#[allow(clippy::module_name_repetitions)]
#[derive(Error, Debug)]
pub enum AppError {
#[error("Database error: {0}")]

@@ -1,3 +1,5 @@
#![allow(clippy::doc_markdown)]
//! Shared utilities and storage helpers for the workspace crates.
pub mod error;
pub mod storage;
pub mod utils;

@@ -7,18 +7,20 @@ use include_dir::{include_dir, Dir};
use std::{ops::Deref, sync::Arc};
use surrealdb::{
engine::any::{connect, Any},
opt::auth::Root,
opt::auth::{Namespace, Root},
Error, Notification, Surreal,
};
use surrealdb_migrations::MigrationRunner;
use tracing::debug;

/// Embedded SurrealDB migration directory packaged with the crate.
static MIGRATIONS_DIR: Dir<'_> = include_dir!("$CARGO_MANIFEST_DIR/");

#[derive(Clone)]
pub struct SurrealDbClient {
pub client: Surreal<Any>,
}
#[allow(clippy::module_name_repetitions)]
pub trait ProvidesDb {
fn db(&self) -> &Arc<SurrealDbClient>;
}
@@ -48,6 +50,24 @@ impl SurrealDbClient {
Ok(SurrealDbClient { client: db })
}

pub async fn new_with_namespace_user(
address: &str,
namespace: &str,
username: &str,
password: &str,
database: &str,
) -> Result<Self, Error> {
let db = connect(address).await?;
db.signin(Namespace {
namespace,
username,
password,
})
.await?;
db.use_ns(namespace).use_db(database).await?;
Ok(SurrealDbClient { client: db })
}

pub async fn create_session_store(
&self,
) -> Result<SessionStore<SessionSurrealPool<Any>>, SessionError> {
@@ -77,21 +97,6 @@ impl SurrealDbClient {
Ok(())
}

/// Operation to rebuild indexes
pub async fn rebuild_indexes(&self) -> Result<(), Error> {
debug!("Rebuilding indexes");
self.client
.query("REBUILD INDEX IF EXISTS idx_embedding_chunks ON text_chunk")
.await?;
self.client
.query("REBUILD INDEX IF EXISTS idx_embedding_entities ON knowledge_entity")
.await?;
self.client
.query("REBUILD INDEX IF EXISTS text_content_fts_idx ON text_content")
.await?;
Ok(())
}

/// Operation to store a object in SurrealDB, requires the struct to implement StoredObject
///
/// # Arguments
@@ -109,6 +114,19 @@ impl SurrealDbClient {
.await
}

/// Operation to upsert an object in SurrealDB, replacing any existing record
/// with the same ID. Useful for idempotent ingestion flows.
pub async fn upsert_item<T>(&self, item: T) -> Result<Option<T>, Error>
where
T: StoredObject + Send + Sync + 'static,
{
let id = item.get_id().to_string();
self.client
.upsert((T::table_name(), id))
.content(item)
.await
}

/// Operation to retrieve all objects from a certain table, requires the struct to implement StoredObject
///
/// # Returns
@@ -247,6 +265,56 @@ mod tests {
assert!(fetch_post.is_none());
}

#[tokio::test]
async fn upsert_item_overwrites_existing_records() {
let namespace = "test_ns";
let database = &Uuid::new_v4().to_string();
let db = SurrealDbClient::memory(namespace, database)
.await
.expect("Failed to start in-memory surrealdb");

db.apply_migrations()
.await
.expect("Failed to initialize schema");

let mut dummy = Dummy {
id: "abc".to_string(),
name: "first".to_string(),
created_at: Utc::now(),
updated_at: Utc::now(),
};

db.store_item(dummy.clone())
.await
.expect("Failed to store initial record");

dummy.name = "updated".to_string();
let upserted = db
.upsert_item(dummy.clone())
.await
.expect("Failed to upsert record");
assert!(upserted.is_some());

let fetched: Option<Dummy> = db.get_item(&dummy.id).await.expect("fetch after upsert");
assert_eq!(fetched.unwrap().name, "updated");

let new_record = Dummy {
id: "def".to_string(),
name: "brand-new".to_string(),
created_at: Utc::now(),
updated_at: Utc::now(),
};
db.upsert_item(new_record.clone())
.await
.expect("Failed to upsert new record");

let fetched_new: Option<Dummy> = db
.get_item(&new_record.id)
.await
.expect("fetch inserted via upsert");
assert_eq!(fetched_new, Some(new_record));
}

#[tokio::test]
async fn test_applying_migrations() {
let namespace = "test_ns";

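Taken together, the two additions to db.rs above give callers a namespace-scoped sign-in path and an idempotent write primitive. A rough usage sketch, assuming namespace-level credentials provisioned out of band and a hypothetical `Note` type generated by the stored_object! macro (neither appears in this diff):

// Sketch only: `Note`, the address and the credentials are illustrative.
async fn example(note: Note) -> Result<(), surrealdb::Error> {
    let db = SurrealDbClient::new_with_namespace_user(
        "ws://127.0.0.1:8000", // address
        "minne",               // namespace
        "app_user",            // namespace-level username (assumed)
        "secret",              // password
        "main",                // database
    )
    .await?;

    // Re-running this with the same id overwrites the earlier record,
    // which is what makes repeated ingestion safe.
    db.upsert_item(note).await?;
    Ok(())
}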
common/src/storage/indexes.rs (Normal file, 795 lines)
@@ -0,0 +1,795 @@
use std::time::Duration;

use anyhow::{Context, Result};
use futures::future::try_join_all;
use serde::Deserialize;
use serde_json::{Map, Value};
use tracing::{debug, info, warn};

use crate::{error::AppError, storage::db::SurrealDbClient};

const INDEX_POLL_INTERVAL: Duration = Duration::from_millis(50);
const FTS_ANALYZER_NAME: &str = "app_en_fts_analyzer";

#[derive(Clone, Copy)]
struct HnswIndexSpec {
index_name: &'static str,
table: &'static str,
options: &'static str,
}

const fn hnsw_index_specs() -> [HnswIndexSpec; 2] {
[
HnswIndexSpec {
index_name: "idx_embedding_text_chunk_embedding",
table: "text_chunk_embedding",
options: "DIST COSINE TYPE F32 EFC 100 M 8 CONCURRENTLY",
},
HnswIndexSpec {
index_name: "idx_embedding_knowledge_entity_embedding",
table: "knowledge_entity_embedding",
options: "DIST COSINE TYPE F32 EFC 100 M 8 CONCURRENTLY",
},
]
}

const fn fts_index_specs() -> [FtsIndexSpec; 8] {
[
FtsIndexSpec {
index_name: "text_content_fts_idx",
table: "text_content",
field: "text",
analyzer: Some(FTS_ANALYZER_NAME),
method: "BM25",
},
FtsIndexSpec {
index_name: "text_content_context_fts_idx",
table: "text_content",
field: "context",
analyzer: Some(FTS_ANALYZER_NAME),
method: "BM25",
},
FtsIndexSpec {
index_name: "text_content_file_name_fts_idx",
table: "text_content",
field: "file_info.file_name",
analyzer: Some(FTS_ANALYZER_NAME),
method: "BM25",
},
FtsIndexSpec {
index_name: "text_content_url_fts_idx",
table: "text_content",
field: "url_info.url",
analyzer: Some(FTS_ANALYZER_NAME),
method: "BM25",
},
FtsIndexSpec {
index_name: "text_content_url_title_fts_idx",
table: "text_content",
field: "url_info.title",
analyzer: Some(FTS_ANALYZER_NAME),
method: "BM25",
},
FtsIndexSpec {
index_name: "knowledge_entity_fts_name_idx",
table: "knowledge_entity",
field: "name",
analyzer: Some(FTS_ANALYZER_NAME),
method: "BM25",
},
FtsIndexSpec {
index_name: "knowledge_entity_fts_description_idx",
table: "knowledge_entity",
field: "description",
analyzer: Some(FTS_ANALYZER_NAME),
method: "BM25",
},
FtsIndexSpec {
index_name: "text_chunk_fts_chunk_idx",
table: "text_chunk",
field: "chunk",
analyzer: Some(FTS_ANALYZER_NAME),
method: "BM25",
},
]
}

impl HnswIndexSpec {
fn definition_if_not_exists(&self, dimension: usize) -> String {
format!(
"DEFINE INDEX IF NOT EXISTS {index} ON TABLE {table} \
FIELDS embedding HNSW DIMENSION {dimension} {options};",
index = self.index_name,
table = self.table,
dimension = dimension,
options = self.options,
)
}

fn definition_overwrite(&self, dimension: usize) -> String {
format!(
"DEFINE INDEX OVERWRITE {index} ON TABLE {table} \
FIELDS embedding HNSW DIMENSION {dimension} {options};",
index = self.index_name,
table = self.table,
dimension = dimension,
options = self.options,
)
}
}

#[derive(Clone, Copy)]
struct FtsIndexSpec {
index_name: &'static str,
table: &'static str,
field: &'static str,
analyzer: Option<&'static str>,
method: &'static str,
}

impl FtsIndexSpec {
fn definition(&self) -> String {
let analyzer_clause = self
.analyzer
.map(|analyzer| format!(" SEARCH ANALYZER {analyzer} {}", self.method))
.unwrap_or_default();

format!(
"DEFINE INDEX IF NOT EXISTS {index} ON TABLE {table} FIELDS {field}{analyzer_clause} CONCURRENTLY;",
index = self.index_name,
table = self.table,
field = self.field,
)
}

fn overwrite_definition(&self) -> String {
let analyzer_clause = self
.analyzer
.map(|analyzer| format!(" SEARCH ANALYZER {analyzer} {}", self.method))
.unwrap_or_default();

format!(
"DEFINE INDEX OVERWRITE {index} ON TABLE {table} FIELDS {field}{analyzer_clause} CONCURRENTLY;",
index = self.index_name,
table = self.table,
field = self.field,
)
}
}

/// Build runtime Surreal indexes (FTS + HNSW) using concurrent creation with readiness polling.
/// Idempotent: safe to call multiple times and will overwrite HNSW definitions when the dimension changes.
pub async fn ensure_runtime_indexes(
db: &SurrealDbClient,
embedding_dimension: usize,
) -> Result<(), AppError> {
ensure_runtime_indexes_inner(db, embedding_dimension)
.await
.map_err(|err| AppError::InternalError(err.to_string()))
}

/// Rebuild known FTS and HNSW indexes, skipping any that are not yet defined.
pub async fn rebuild_indexes(db: &SurrealDbClient) -> Result<(), AppError> {
rebuild_indexes_inner(db)
.await
.map_err(|err| AppError::InternalError(err.to_string()))
}

async fn ensure_runtime_indexes_inner(
db: &SurrealDbClient,
embedding_dimension: usize,
) -> Result<()> {
create_fts_analyzer(db).await?;

for spec in fts_index_specs() {
if index_exists(db, spec.table, spec.index_name).await? {
continue;
}
// We need to create these sequentially otherwise SurrealDB errors with read/write clash
create_index_with_polling(
db,
spec.definition(),
spec.index_name,
spec.table,
Some(spec.table),
)
.await?;
}

let hnsw_tasks = hnsw_index_specs().into_iter().map(|spec| async move {
match hnsw_index_state(db, &spec, embedding_dimension).await? {
HnswIndexState::Missing => {
create_index_with_polling(
db,
spec.definition_if_not_exists(embedding_dimension),
spec.index_name,
spec.table,
Some(spec.table),
)
.await
}
HnswIndexState::Matches => {
let status = get_index_status(db, spec.index_name, spec.table).await?;
if status.eq_ignore_ascii_case("error") {
warn!(
index = spec.index_name,
table = spec.table,
"HNSW index found in error state; triggering rebuild"
);
create_index_with_polling(
db,
spec.definition_overwrite(embedding_dimension),
spec.index_name,
spec.table,
Some(spec.table),
)
.await
} else {
Ok(())
}
}
HnswIndexState::Different(existing) => {
info!(
index = spec.index_name,
table = spec.table,
existing_dimension = existing,
target_dimension = embedding_dimension,
"Overwriting HNSW index to match new embedding dimension"
);
create_index_with_polling(
db,
spec.definition_overwrite(embedding_dimension),
spec.index_name,
spec.table,
Some(spec.table),
)
.await
}
}
});

try_join_all(hnsw_tasks).await.map(|_| ())?;

Ok(())
}

async fn get_index_status(db: &SurrealDbClient, index_name: &str, table: &str) -> Result<String> {
let info_query = format!("INFO FOR INDEX {index_name} ON TABLE {table};");
let mut info_res = db
.client
.query(info_query)
.await
.context("checking index status")?;
let info: Option<Value> = info_res.take(0).context("failed to take info result")?;

let info = match info {
Some(i) => i,
None => return Ok("unknown".to_string()),
};

let building = info.get("building");
let status = building
.and_then(|b| b.get("status"))
.and_then(|s| s.as_str())
.unwrap_or("ready")
.to_string();

Ok(status)
}

async fn rebuild_indexes_inner(db: &SurrealDbClient) -> Result<()> {
debug!("Rebuilding indexes with concurrent definitions");
create_fts_analyzer(db).await?;

for spec in fts_index_specs() {
if !index_exists(db, spec.table, spec.index_name).await? {
debug!(
index = spec.index_name,
table = spec.table,
"Skipping FTS rebuild because index is missing"
);
continue;
}

create_index_with_polling(
db,
spec.overwrite_definition(),
spec.index_name,
spec.table,
Some(spec.table),
)
.await?;
}

let hnsw_tasks = hnsw_index_specs().into_iter().map(|spec| async move {
if !index_exists(db, spec.table, spec.index_name).await? {
debug!(
index = spec.index_name,
table = spec.table,
"Skipping HNSW rebuild because index is missing"
);
return Ok(());
}

let Some(dimension) = existing_hnsw_dimension(db, &spec).await? else {
warn!(
index = spec.index_name,
table = spec.table,
"HNSW index missing dimension; skipping rebuild"
);
return Ok(());
};

create_index_with_polling(
db,
spec.definition_overwrite(dimension),
spec.index_name,
spec.table,
Some(spec.table),
)
.await
});

try_join_all(hnsw_tasks).await.map(|_| ())
}

async fn existing_hnsw_dimension(
db: &SurrealDbClient,
spec: &HnswIndexSpec,
) -> Result<Option<usize>> {
let Some(indexes) = table_index_definitions(db, spec.table).await? else {
return Ok(None);
};

let Some(definition) = indexes
.get(spec.index_name)
.and_then(|details| details.get("Strand"))
.and_then(|v| v.as_str())
else {
return Ok(None);
};

Ok(extract_dimension(definition).and_then(|d| usize::try_from(d).ok()))
}

async fn hnsw_index_state(
db: &SurrealDbClient,
spec: &HnswIndexSpec,
expected_dimension: usize,
) -> Result<HnswIndexState> {
match existing_hnsw_dimension(db, spec).await? {
None => Ok(HnswIndexState::Missing),
Some(current_dimension) if current_dimension == expected_dimension => {
Ok(HnswIndexState::Matches)
}
Some(current_dimension) => Ok(HnswIndexState::Different(current_dimension as u64)),
}
}

enum HnswIndexState {
Missing,
Matches,
Different(u64),
}

fn extract_dimension(definition: &str) -> Option<u64> {
definition
.split("DIMENSION")
.nth(1)
.and_then(|rest| rest.split_whitespace().next())
.and_then(|token| token.trim_end_matches(';').parse::<u64>().ok())
}

async fn create_fts_analyzer(db: &SurrealDbClient) -> Result<()> {
// Prefer snowball stemming when supported; fall back to ascii-only when the filter
// is unavailable in the running Surreal build. Use IF NOT EXISTS to avoid clobbering
// an existing analyzer definition.
let snowball_query = format!(
"DEFINE ANALYZER IF NOT EXISTS {analyzer}
TOKENIZERS class
FILTERS lowercase, ascii, snowball(english);",
analyzer = FTS_ANALYZER_NAME
);

match db.client.query(snowball_query).await {
Ok(res) => {
if res.check().is_ok() {
return Ok(());
}
warn!(
"Snowball analyzer check failed; attempting ascii fallback definition (analyzer: {})",
FTS_ANALYZER_NAME
);
}
Err(err) => {
warn!(
error = %err,
"Snowball analyzer creation errored; attempting ascii fallback definition"
);
}
}

let fallback_query = format!(
"DEFINE ANALYZER IF NOT EXISTS {analyzer}
TOKENIZERS class
FILTERS lowercase, ascii;",
analyzer = FTS_ANALYZER_NAME
);

let res = db
.client
.query(fallback_query)
.await
.context("creating fallback FTS analyzer")?;

if let Err(err) = res.check() {
warn!(
error = %err,
"Fallback analyzer creation failed; FTS will run without snowball/ascii analyzer ({})",
FTS_ANALYZER_NAME
);
return Err(err).context("failed to create fallback FTS analyzer");
}

warn!(
"Snowball analyzer unavailable; using fallback analyzer ({}) with lowercase+ascii only",
FTS_ANALYZER_NAME
);

Ok(())
}

async fn create_index_with_polling(
db: &SurrealDbClient,
definition: String,
index_name: &str,
table: &str,
progress_table: Option<&str>,
) -> Result<()> {
let expected_total = match progress_table {
Some(table) => Some(count_table_rows(db, table).await.with_context(|| {
format!("counting rows in {table} for index {index_name} progress")
})?),
None => None,
};

let mut attempts = 0;
const MAX_ATTEMPTS: usize = 3;
loop {
attempts += 1;
let res = db
.client
.query(definition.clone())
.await
.with_context(|| format!("creating index {index_name} on table {table}"))?;
match res.check() {
Ok(_) => break,
Err(err) => {
let msg = err.to_string();
let conflict = msg.contains("read or write conflict");
warn!(
index = %index_name,
table = %table,
error = ?err,
attempt = attempts,
definition = %definition,
"Index definition failed"
);
if conflict && attempts < MAX_ATTEMPTS {
tokio::time::sleep(Duration::from_millis(100)).await;
continue;
}
return Err(err).with_context(|| {
format!("index definition failed for {index_name} on {table}")
});
}
}
}

debug!(
index = %index_name,
table = %table,
expected_rows = ?expected_total,
"Index definition submitted; waiting for build to finish"
);

poll_index_build_status(db, index_name, table, expected_total, INDEX_POLL_INTERVAL).await
}

async fn poll_index_build_status(
db: &SurrealDbClient,
index_name: &str,
table: &str,
total_rows: Option<u64>,
poll_every: Duration,
) -> Result<()> {
let started_at = std::time::Instant::now();

loop {
tokio::time::sleep(poll_every).await;

let info_query = format!("INFO FOR INDEX {index_name} ON TABLE {table};");
let mut info_res =
db.client.query(info_query).await.with_context(|| {
format!("checking index build status for {index_name} on {table}")
})?;

let info: Option<Value> = info_res
.take(0)
.context("failed to deserialize INFO FOR INDEX result")?;

let Some(snapshot) = parse_index_build_info(info, total_rows) else {
warn!(
index = %index_name,
table = %table,
"INFO FOR INDEX returned no data; assuming index definition might be missing"
);
break;
};

match snapshot.progress_pct {
Some(pct) => debug!(
index = %index_name,
table = %table,
status = snapshot.status,
initial = snapshot.initial,
pending = snapshot.pending,
updated = snapshot.updated,
processed = snapshot.processed,
total = snapshot.total_rows,
progress_pct = format_args!("{pct:.1}"),
"Index build status"
),
None => debug!(
index = %index_name,
table = %table,
status = snapshot.status,
initial = snapshot.initial,
pending = snapshot.pending,
updated = snapshot.updated,
processed = snapshot.processed,
"Index build status"
),
}

if snapshot.is_ready() {
debug!(
index = %index_name,
table = %table,
elapsed = ?started_at.elapsed(),
processed = snapshot.processed,
total = snapshot.total_rows,
"Index is ready"
);
break;
}

if snapshot.status.eq_ignore_ascii_case("error") {
warn!(
index = %index_name,
table = %table,
status = snapshot.status,
"Index build reported error status; stopping polling"
);
break;
}
}

Ok(())
}

#[derive(Debug, PartialEq)]
struct IndexBuildSnapshot {
status: String,
initial: u64,
pending: u64,
updated: u64,
processed: u64,
total_rows: Option<u64>,
progress_pct: Option<f64>,
}

impl IndexBuildSnapshot {
fn is_ready(&self) -> bool {
self.status.eq_ignore_ascii_case("ready")
}
}

fn parse_index_build_info(
info: Option<Value>,
total_rows: Option<u64>,
) -> Option<IndexBuildSnapshot> {
let info = info?;
let building = info.get("building");

let status = building
.and_then(|b| b.get("status"))
.and_then(|s| s.as_str())
// If there's no `building` block at all, treat as "ready" (index not building anymore)
.unwrap_or("ready")
.to_string();

let initial = building
.and_then(|b| b.get("initial"))
.and_then(|v| v.as_u64())
.unwrap_or(0);

let pending = building
.and_then(|b| b.get("pending"))
.and_then(|v| v.as_u64())
.unwrap_or(0);

let updated = building
.and_then(|b| b.get("updated"))
.and_then(|v| v.as_u64())
.unwrap_or(0);

// `initial` is the number of rows seen when the build started; `updated` accounts for later writes.
let processed = initial.saturating_add(updated);

let progress_pct = total_rows.map(|total| {
if total == 0 {
0.0
} else {
((processed as f64 / total as f64).min(1.0)) * 100.0
}
});

Some(IndexBuildSnapshot {
status,
initial,
pending,
updated,
processed,
total_rows,
progress_pct,
})
}

#[derive(Debug, Deserialize)]
struct CountRow {
count: u64,
}

async fn count_table_rows(db: &SurrealDbClient, table: &str) -> Result<u64> {
let query = format!("SELECT count() AS count FROM {table} GROUP ALL;");
let mut response = db
.client
.query(query)
.await
.with_context(|| format!("counting rows in {table}"))?;
let rows: Vec<CountRow> = response
.take(0)
.context("failed to deserialize count() response")?;
Ok(rows.first().map_or(0, |r| r.count))
}

async fn table_index_definitions(
db: &SurrealDbClient,
table: &str,
) -> Result<Option<Map<String, Value>>> {
let info_query = format!("INFO FOR TABLE {table};");
let mut response = db
.client
.query(info_query)
.await
.with_context(|| format!("fetching table info for {}", table))?;

let info: surrealdb::Value = response
.take(0)
.context("failed to take table info response")?;

let info_json: Value =
serde_json::to_value(info).context("serializing table info to JSON for parsing")?;

Ok(info_json
.get("Object")
.and_then(|o| o.get("indexes"))
.and_then(|i| i.get("Object"))
.and_then(|i| i.as_object())
.cloned())
}

async fn index_exists(db: &SurrealDbClient, table: &str, index_name: &str) -> Result<bool> {
let Some(indexes) = table_index_definitions(db, table).await? else {
return Ok(false);
};

Ok(indexes.contains_key(index_name))
}

#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
use uuid::Uuid;

#[test]
fn parse_index_build_info_reports_progress() {
let info = json!({
"building": {
"initial": 56894,
"pending": 0,
"status": "indexing",
"updated": 0
}
});

let snapshot = parse_index_build_info(Some(info), Some(61081)).expect("snapshot");
assert_eq!(
snapshot,
IndexBuildSnapshot {
status: "indexing".to_string(),
initial: 56894,
pending: 0,
updated: 0,
processed: 56894,
total_rows: Some(61081),
progress_pct: Some((56894_f64 / 61081_f64) * 100.0),
}
);
assert!(!snapshot.is_ready());
}

#[test]
fn parse_index_build_info_defaults_to_ready_when_no_building_block() {
// Surreal returns `{}` when the index exists but isn't building.
let info = json!({});
let snapshot = parse_index_build_info(Some(info), Some(10)).expect("snapshot");
assert!(snapshot.is_ready());
assert_eq!(snapshot.processed, 0);
assert_eq!(snapshot.progress_pct, Some(0.0));
}

#[test]
fn extract_dimension_parses_value() {
let definition = "DEFINE INDEX idx_embedding_text_chunk_embedding ON TABLE text_chunk_embedding FIELDS embedding HNSW DIMENSION 1536 DIST COSINE TYPE F32 EFC 100 M 8;";
assert_eq!(extract_dimension(definition), Some(1536));
}

#[tokio::test]
async fn ensure_runtime_indexes_is_idempotent() {
let namespace = "indexes_ns";
let database = &Uuid::new_v4().to_string();
let db = SurrealDbClient::memory(namespace, database)
.await
.expect("in-memory db");

db.apply_migrations()
.await
.expect("migrations should succeed");

// First run creates everything
ensure_runtime_indexes(&db, 1536)
.await
.expect("initial index creation");

// Second run should be a no-op and still succeed
ensure_runtime_indexes(&db, 1536)
.await
.expect("second index creation");
}

#[tokio::test]
async fn ensure_hnsw_index_overwrites_dimension() {
let namespace = "indexes_dim";
let database = &Uuid::new_v4().to_string();
let db = SurrealDbClient::memory(namespace, database)
.await
.expect("in-memory db");

db.apply_migrations()
.await
.expect("migrations should succeed");

// Create initial index with default dimension
ensure_runtime_indexes(&db, 1536)
.await
.expect("initial index creation");

// Change dimension and ensure overwrite path is exercised
ensure_runtime_indexes(&db, 128)
.await
.expect("overwritten index creation");
}
}
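Since the HNSW and FTS definitions now live outside the migration files, something has to call into this module at startup. A plausible wiring sketch, assuming the embedding dimension comes from configuration; the actual call site in the server crates is not shown in this diff, and the `common::` paths are assumed crate names:

// Sketch of startup wiring; error mapping and crate paths are assumptions.
use common::error::AppError;
use common::storage::{db::SurrealDbClient, indexes};

async fn prepare_database(db: &SurrealDbClient, embedding_dimension: usize) -> Result<(), AppError> {
    // Tables, fields and static indexes first, via the embedded migrations...
    db.apply_migrations()
        .await
        .map_err(|e| AppError::InternalError(e.to_string()))?;
    // ...then the runtime FTS/HNSW indexes, which are safe to re-run on every boot.
    indexes::ensure_runtime_indexes(db, embedding_dimension).await
}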
@@ -1,3 +1,4 @@
pub mod db;
pub mod indexes;
pub mod store;
pub mod types;

File diff suppressed because it is too large
@@ -71,6 +71,7 @@ impl Analytics {
// We need to use a direct query for COUNT aggregation
#[derive(Debug, Deserialize)]
struct CountResult {
/// Total user count.
count: i64,
}

@@ -81,7 +82,7 @@ impl Analytics {
.await?
.take(0)?;

Ok(result.map(|r| r.count).unwrap_or(0))
Ok(result.map_or(0, |r| r.count))
}
}

@@ -10,6 +10,54 @@ stored_object!(Conversation, "conversation", {
title: String
});

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, PartialEq, Eq)]
pub struct SidebarConversation {
#[serde(deserialize_with = "deserialize_sidebar_id")]
pub id: String,
pub title: String,
}

struct SidebarIdVisitor;

impl<'de> serde::de::Visitor<'de> for SidebarIdVisitor {
type Value = String;

fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
formatter.write_str("a string id or a SurrealDB Thing")
}

fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Ok(value.to_string())
}

fn visit_string<E>(self, value: String) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Ok(value)
}

fn visit_map<A>(self, map: A) -> Result<Self::Value, A::Error>
where
A: serde::de::MapAccess<'de>,
{
let thing = <surrealdb::sql::Thing as serde::Deserialize>::deserialize(
serde::de::value::MapAccessDeserializer::new(map),
)?;
Ok(thing.id.to_raw())
}
}

fn deserialize_sidebar_id<'de, D>(deserializer: D) -> Result<String, D::Error>
where
D: serde::Deserializer<'de>,
{
deserializer.deserialize_any(SidebarIdVisitor)
}

impl Conversation {
pub fn new(user_id: String, title: String) -> Self {
let now = Utc::now();
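The custom visitor above exists because the same projection is deserialized from two shapes: SurrealDB responses hand back a Thing for `id`, while plain JSON (for example in tests) may carry a string. A small illustration of the string path, written against serde_json; the Thing path goes through visit_map and depends on surrealdb's own serialization of `Thing`, so it is not reproduced here:

// Illustration only: exercises the plain-string branch of deserialize_sidebar_id.
fn sidebar_from_plain_json() {
    let json = r#"{ "id": "abc123", "title": "Weekly notes" }"#;
    let item: SidebarConversation = serde_json::from_str(json).expect("valid sidebar row");
    assert_eq!(item.id, "abc123");
    assert_eq!(item.title, "Weekly notes");
}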
@@ -75,6 +123,23 @@ impl Conversation {

Ok(())
}

pub async fn get_user_sidebar_conversations(
user_id: &str,
db: &SurrealDbClient,
) -> Result<Vec<SidebarConversation>, AppError> {
let conversations: Vec<SidebarConversation> = db
.client
.query(
"SELECT id, title, updated_at FROM type::table($table_name) WHERE user_id = $user_id ORDER BY updated_at DESC",
)
.bind(("table_name", Self::table_name()))
.bind(("user_id", user_id.to_string()))
.await?
.take(0)?;

Ok(conversations)
}
}

#[cfg(test)]
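The projection above only pulls id, title and updated_at, which keeps the sidebar fetch cheap compared to loading full Conversation rows, and the new conversation_user_updated_at_idx backs its ORDER BY. A hedged sketch of a caller; the surrounding handler types are not part of this diff:

// Sketch: fetch the sidebar projection for the signed-in user.
async fn sidebar_titles(db: &SurrealDbClient, user_id: &str) -> Result<Vec<String>, AppError> {
    let items = Conversation::get_user_sidebar_conversations(user_id, db).await?;
    // Already ordered by updated_at DESC by the query itself.
    Ok(items.into_iter().map(|c| c.title).collect())
}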
@@ -249,6 +314,96 @@ mod tests {
}
}

#[tokio::test]
async fn test_get_user_sidebar_conversations_filters_and_orders_by_updated_at_desc() {
let namespace = "test_ns";
let database = &Uuid::new_v4().to_string();
let db = SurrealDbClient::memory(namespace, database)
.await
.expect("Failed to start in-memory surrealdb");

let user_id = "sidebar_user";
let other_user_id = "other_user";
let base = Utc::now();

let mut oldest = Conversation::new(user_id.to_string(), "Oldest".to_string());
oldest.updated_at = base - chrono::Duration::minutes(30);

let mut newest = Conversation::new(user_id.to_string(), "Newest".to_string());
newest.updated_at = base - chrono::Duration::minutes(5);

let mut middle = Conversation::new(user_id.to_string(), "Middle".to_string());
middle.updated_at = base - chrono::Duration::minutes(15);

let mut other_user = Conversation::new(other_user_id.to_string(), "Other".to_string());
other_user.updated_at = base;

db.store_item(oldest.clone())
.await
.expect("Failed to store oldest conversation");
db.store_item(newest.clone())
.await
.expect("Failed to store newest conversation");
db.store_item(middle.clone())
.await
.expect("Failed to store middle conversation");
db.store_item(other_user)
.await
.expect("Failed to store other-user conversation");

let sidebar_items = Conversation::get_user_sidebar_conversations(user_id, &db)
.await
.expect("Failed to get sidebar conversations");

assert_eq!(sidebar_items.len(), 3);
assert_eq!(sidebar_items[0].id, newest.id);
assert_eq!(sidebar_items[0].title, "Newest");
assert_eq!(sidebar_items[1].id, middle.id);
assert_eq!(sidebar_items[1].title, "Middle");
assert_eq!(sidebar_items[2].id, oldest.id);
assert_eq!(sidebar_items[2].title, "Oldest");
}

#[tokio::test]
async fn test_sidebar_projection_reflects_patch_title_and_updated_at_reorder() {
let namespace = "test_ns";
let database = &Uuid::new_v4().to_string();
let db = SurrealDbClient::memory(namespace, database)
.await
.expect("Failed to start in-memory surrealdb");

let user_id = "sidebar_patch_user";
let base = Utc::now();

let mut first = Conversation::new(user_id.to_string(), "First".to_string());
first.updated_at = base - chrono::Duration::minutes(20);

let mut second = Conversation::new(user_id.to_string(), "Second".to_string());
second.updated_at = base - chrono::Duration::minutes(10);

db.store_item(first.clone())
.await
.expect("Failed to store first conversation");
db.store_item(second.clone())
.await
.expect("Failed to store second conversation");

let before_patch = Conversation::get_user_sidebar_conversations(user_id, &db)
.await
.expect("Failed to get sidebar conversations before patch");
assert_eq!(before_patch[0].id, second.id);

Conversation::patch_title(&first.id, user_id, "First (renamed)", &db)
.await
.expect("Failed to patch conversation title");

let after_patch = Conversation::get_user_sidebar_conversations(user_id, &db)
.await
.expect("Failed to get sidebar conversations after patch");
assert_eq!(after_patch[0].id, first.id);
assert_eq!(after_patch[0].title, "First (renamed)");
}

#[tokio::test]
async fn test_get_complete_conversation_with_messages() {
// Setup in-memory database for testing

File diff suppressed because it is too large
@@ -1,3 +1,9 @@
#![allow(
clippy::result_large_err,
clippy::needless_pass_by_value,
clippy::implicit_clone,
clippy::semicolon_if_nothing_returned
)]
use crate::{error::AppError, storage::types::file_info::FileInfo};
use serde::{Deserialize, Serialize};
use tracing::info;
@@ -38,6 +44,7 @@ impl IngestionPayload {
/// # Returns
/// * `Result<Vec<IngestionPayload>, AppError>` - On success, returns a vector of ingress objects
/// (one per file/content type). On failure, returns an `AppError`.
#[allow(clippy::similar_names)]
pub fn create_ingestion_payload(
content: Option<String>,
context: String,

@@ -1,116 +1,538 @@
use futures::Stream;
use surrealdb::{opt::PatchOp, Notification};
#![allow(
clippy::cast_possible_wrap,
clippy::items_after_statements,
clippy::arithmetic_side_effects,
clippy::cast_sign_loss,
clippy::missing_docs_in_private_items,
clippy::trivially_copy_pass_by_ref,
clippy::expect_used
)]
use std::time::Duration;

use chrono::Duration as ChronoDuration;
use state_machines::state_machine;
use surrealdb::sql::Datetime as SurrealDatetime;
use uuid::Uuid;

use crate::{error::AppError, storage::db::SurrealDbClient, stored_object};

use super::ingestion_payload::IngestionPayload;

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(tag = "name")]
pub enum IngestionTaskStatus {
Created,
InProgress {
attempts: u32,
last_attempt: DateTime<Utc>,
},
Completed,
Error {
message: String,
},
pub const MAX_ATTEMPTS: u32 = 3;
pub const DEFAULT_LEASE_SECS: i64 = 300;
pub const DEFAULT_PRIORITY: i32 = 0;

#[derive(Debug, Default, Clone, serde::Serialize, serde::Deserialize, PartialEq, Eq)]
pub enum TaskState {
#[serde(rename = "Pending")]
#[default]
Pending,
#[serde(rename = "Reserved")]
Reserved,
#[serde(rename = "Processing")]
Processing,
#[serde(rename = "Succeeded")]
Succeeded,
#[serde(rename = "Failed")]
Failed,
#[serde(rename = "Cancelled")]
Cancelled,
#[serde(rename = "DeadLetter")]
DeadLetter,
}

impl TaskState {
pub fn as_str(&self) -> &'static str {
match self {
TaskState::Pending => "Pending",
TaskState::Reserved => "Reserved",
TaskState::Processing => "Processing",
TaskState::Succeeded => "Succeeded",
TaskState::Failed => "Failed",
TaskState::Cancelled => "Cancelled",
TaskState::DeadLetter => "DeadLetter",
}
}

pub fn is_terminal(&self) -> bool {
matches!(
self,
TaskState::Succeeded | TaskState::Cancelled | TaskState::DeadLetter
)
}

pub fn display_label(&self) -> &'static str {
match self {
TaskState::Pending => "Pending",
TaskState::Reserved => "Reserved",
TaskState::Processing => "Processing",
TaskState::Succeeded => "Completed",
TaskState::Failed => "Retrying",
TaskState::Cancelled => "Cancelled",
TaskState::DeadLetter => "Dead Letter",
}
}
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, PartialEq, Eq, Default)]
pub struct TaskErrorInfo {
pub code: Option<String>,
pub message: String,
}

#[derive(Debug, Clone, Copy)]
enum TaskTransition {
StartProcessing,
Succeed,
Fail,
Cancel,
DeadLetter,
Release,
}

impl TaskTransition {
fn as_str(&self) -> &'static str {
match self {
TaskTransition::StartProcessing => "start_processing",
TaskTransition::Succeed => "succeed",
TaskTransition::Fail => "fail",
TaskTransition::Cancel => "cancel",
TaskTransition::DeadLetter => "deadletter",
TaskTransition::Release => "release",
}
}
}

mod lifecycle {
use super::state_machine;

state_machine! {
name: TaskLifecycleMachine,
initial: Pending,
states: [Pending, Reserved, Processing, Succeeded, Failed, Cancelled, DeadLetter],
events {
reserve {
transition: { from: Pending, to: Reserved }
transition: { from: Failed, to: Reserved }
}
start_processing {
transition: { from: Reserved, to: Processing }
}
succeed {
transition: { from: Processing, to: Succeeded }
}
fail {
transition: { from: Processing, to: Failed }
}
cancel {
transition: { from: Pending, to: Cancelled }
transition: { from: Reserved, to: Cancelled }
transition: { from: Processing, to: Cancelled }
}
deadletter {
transition: { from: Failed, to: DeadLetter }
}
release {
transition: { from: Reserved, to: Pending }
}
}
}

pub(super) fn pending() -> TaskLifecycleMachine<(), Pending> {
TaskLifecycleMachine::new(())
}

pub(super) fn reserved() -> TaskLifecycleMachine<(), Reserved> {
pending()
.reserve()
.expect("reserve transition from Pending should exist")
}

pub(super) fn processing() -> TaskLifecycleMachine<(), Processing> {
reserved()
.start_processing()
.expect("start_processing transition from Reserved should exist")
}

pub(super) fn failed() -> TaskLifecycleMachine<(), Failed> {
processing()
.fail()
.expect("fail transition from Processing should exist")
}
}

fn invalid_transition(state: &TaskState, event: TaskTransition) -> AppError {
AppError::Validation(format!(
"Invalid task transition: {} -> {}",
state.as_str(),
event.as_str()
))
}

stored_object!(IngestionTask, "ingestion_task", {
content: IngestionPayload,
status: IngestionTaskStatus,
user_id: String
state: TaskState,
user_id: String,
attempts: u32,
max_attempts: u32,
#[serde(serialize_with = "serialize_datetime", deserialize_with = "deserialize_datetime")]
scheduled_at: chrono::DateTime<chrono::Utc>,
#[serde(
serialize_with = "serialize_option_datetime",
deserialize_with = "deserialize_option_datetime",
default
)]
locked_at: Option<chrono::DateTime<chrono::Utc>>,
lease_duration_secs: i64,
worker_id: Option<String>,
error_code: Option<String>,
error_message: Option<String>,
#[serde(
serialize_with = "serialize_option_datetime",
deserialize_with = "deserialize_option_datetime",
default
)]
last_error_at: Option<chrono::DateTime<chrono::Utc>>,
priority: i32
});

pub const MAX_ATTEMPTS: u32 = 3;

impl IngestionTask {
pub async fn new(content: IngestionPayload, user_id: String) -> Self {
let now = Utc::now();
pub fn new(content: IngestionPayload, user_id: String) -> Self {
let now = chrono::Utc::now();

Self {
id: Uuid::new_v4().to_string(),
content,
status: IngestionTaskStatus::Created,
state: TaskState::Pending,
user_id,
attempts: 0,
max_attempts: MAX_ATTEMPTS,
scheduled_at: now,
locked_at: None,
lease_duration_secs: DEFAULT_LEASE_SECS,
worker_id: None,
error_code: None,
error_message: None,
last_error_at: None,
priority: DEFAULT_PRIORITY,
created_at: now,
updated_at: now,
user_id,
}
}

/// Creates a new job and stores it in the database
pub fn can_retry(&self) -> bool {
self.attempts < self.max_attempts
}

pub fn lease_duration(&self) -> Duration {
Duration::from_secs(self.lease_duration_secs.max(0) as u64)
}

pub async fn create_and_add_to_db(
content: IngestionPayload,
user_id: String,
db: &SurrealDbClient,
) -> Result<IngestionTask, AppError> {
let task = Self::new(content, user_id).await;

let task = Self::new(content, user_id);
db.store_item(task.clone()).await?;

Ok(task)
}

// Update job status
pub async fn update_status(
id: &str,
status: IngestionTaskStatus,
pub async fn claim_next_ready(
db: &SurrealDbClient,
) -> Result<(), AppError> {
let _job: Option<Self> = db
.update((Self::table_name(), id))
.patch(PatchOp::replace("/status", status))
.patch(PatchOp::replace(
"/updated_at",
surrealdb::Datetime::from(Utc::now()),
worker_id: &str,
now: chrono::DateTime<chrono::Utc>,
lease_duration: Duration,
) -> Result<Option<IngestionTask>, AppError> {
debug_assert!(lifecycle::pending().reserve().is_ok());
debug_assert!(lifecycle::failed().reserve().is_ok());

const CLAIM_QUERY: &str = r#"
UPDATE (
SELECT * FROM type::table($table)
WHERE state IN $candidate_states
AND scheduled_at <= $now
AND (
attempts < max_attempts
OR state IN $sticky_states
)
AND (
locked_at = NONE
OR time::unix($now) - time::unix(locked_at) >= lease_duration_secs
)
ORDER BY priority DESC, scheduled_at ASC, created_at ASC
LIMIT 1
)
SET state = $reserved_state,
attempts = if state IN $increment_states THEN
if attempts + 1 > max_attempts THEN max_attempts ELSE attempts + 1 END
ELSE
attempts
END,
locked_at = $now,
worker_id = $worker_id,
lease_duration_secs = $lease_secs,
updated_at = $now
RETURN *;
"#;

let mut result = db
.client
.query(CLAIM_QUERY)
.bind(("table", Self::table_name()))
.bind((
"candidate_states",
vec![
TaskState::Pending.as_str(),
TaskState::Failed.as_str(),
TaskState::Reserved.as_str(),
TaskState::Processing.as_str(),
],
))
.bind((
"sticky_states",
vec![TaskState::Reserved.as_str(), TaskState::Processing.as_str()],
))
.bind((
"increment_states",
vec![TaskState::Pending.as_str(), TaskState::Failed.as_str()],
))
.bind(("reserved_state", TaskState::Reserved.as_str()))
.bind(("now", SurrealDatetime::from(now)))
.bind(("worker_id", worker_id.to_string()))
.bind(("lease_secs", lease_duration.as_secs() as i64))
.await?;

Ok(())
let task: Option<IngestionTask> = result.take(0)?;
Ok(task)
}

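claim_next_ready is the single entry point a worker needs: it atomically reserves one due task, bumps attempts on fresh claims, and re-claims tasks whose lease has expired. A rough worker loop built from the methods in this file; `process` stands in for the real ingestion pipeline and the 30-second retry delay is illustrative, not the project's policy:

// Sketch of a worker loop over the task-queue API above; imports as at the top
// of this file, error handling trimmed, `process` is a placeholder.
async fn run_worker(db: &SurrealDbClient, worker_id: &str) -> Result<(), AppError> {
    loop {
        let now = chrono::Utc::now();
        let lease = Duration::from_secs(DEFAULT_LEASE_SECS as u64);
        let Some(task) = IngestionTask::claim_next_ready(db, worker_id, now, lease).await? else {
            tokio::time::sleep(Duration::from_secs(1)).await;
            continue;
        };

        let task = task.mark_processing(db).await?;
        match process(&task).await {
            Ok(()) => {
                task.mark_succeeded(db).await?;
            }
            Err(err) => {
                let info = TaskErrorInfo { code: None, message: err.to_string() };
                if task.can_retry() {
                    // Illustrative backoff; the real policy lives with the worker.
                    task.mark_failed(info, Duration::from_secs(30), db).await?;
                } else {
                    task.mark_dead_letter(info, db).await?;
                }
            }
        }
    }
}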
/// Listen for new jobs
|
||||
pub async fn listen_for_tasks(
|
||||
pub async fn mark_processing(&self, db: &SurrealDbClient) -> Result<IngestionTask, AppError> {
|
||||
const START_PROCESSING_QUERY: &str = r#"
|
||||
UPDATE type::thing($table, $id)
|
||||
SET state = $processing,
|
||||
updated_at = $now,
|
||||
locked_at = $now
|
||||
WHERE state = $reserved AND worker_id = $worker_id
|
||||
RETURN *;
|
||||
"#;
|
||||
|
||||
let now = chrono::Utc::now();
|
||||
let mut result = db
|
||||
.client
|
||||
.query(START_PROCESSING_QUERY)
|
||||
.bind(("table", Self::table_name()))
|
||||
.bind(("id", self.id.clone()))
|
||||
.bind(("processing", TaskState::Processing.as_str()))
|
||||
.bind(("reserved", TaskState::Reserved.as_str()))
|
||||
.bind(("now", SurrealDatetime::from(now)))
|
||||
.bind(("worker_id", self.worker_id.clone().unwrap_or_default()))
|
||||
.await?;
|
||||
|
||||
let updated: Option<IngestionTask> = result.take(0)?;
|
||||
updated.ok_or_else(|| invalid_transition(&self.state, TaskTransition::StartProcessing))
|
||||
}
|
||||
|
||||
pub async fn mark_succeeded(&self, db: &SurrealDbClient) -> Result<IngestionTask, AppError> {
|
||||
const COMPLETE_QUERY: &str = r#"
|
||||
UPDATE type::thing($table, $id)
|
||||
SET state = $succeeded,
|
||||
updated_at = $now,
|
||||
locked_at = NONE,
|
||||
worker_id = NONE,
|
||||
scheduled_at = $now,
|
||||
error_code = NONE,
|
||||
error_message = NONE,
|
||||
last_error_at = NONE
|
||||
WHERE state = $processing AND worker_id = $worker_id
|
||||
RETURN *;
|
||||
"#;
|
||||
|
||||
let now = chrono::Utc::now();
|
||||
let mut result = db
|
||||
.client
|
||||
.query(COMPLETE_QUERY)
|
||||
.bind(("table", Self::table_name()))
|
||||
.bind(("id", self.id.clone()))
|
||||
.bind(("succeeded", TaskState::Succeeded.as_str()))
|
||||
.bind(("processing", TaskState::Processing.as_str()))
|
||||
.bind(("now", SurrealDatetime::from(now)))
|
||||
.bind(("worker_id", self.worker_id.clone().unwrap_or_default()))
|
||||
.await?;
|
||||
|
||||
let updated: Option<IngestionTask> = result.take(0)?;
|
||||
updated.ok_or_else(|| invalid_transition(&self.state, TaskTransition::Succeed))
|
||||
}
|
||||
|
||||
pub async fn mark_failed(
|
||||
&self,
|
||||
error: TaskErrorInfo,
|
||||
retry_delay: Duration,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<impl Stream<Item = Result<Notification<Self>, surrealdb::Error>>, surrealdb::Error>
|
||||
{
|
||||
db.listen::<Self>().await
|
||||
) -> Result<IngestionTask, AppError> {
|
||||
let now = chrono::Utc::now();
|
||||
let retry_at = now
|
||||
+ ChronoDuration::from_std(retry_delay).unwrap_or_else(|_| ChronoDuration::seconds(30));
|
||||
|
||||
const FAIL_QUERY: &str = r#"
|
||||
UPDATE type::thing($table, $id)
|
||||
SET state = $failed,
|
||||
updated_at = $now,
|
||||
locked_at = NONE,
|
||||
worker_id = NONE,
|
||||
scheduled_at = $retry_at,
|
||||
error_code = $error_code,
|
||||
error_message = $error_message,
|
||||
last_error_at = $now
|
||||
WHERE state = $processing AND worker_id = $worker_id
|
||||
RETURN *;
|
||||
"#;
|
||||
|
||||
let mut result = db
|
||||
.client
|
||||
.query(FAIL_QUERY)
|
||||
.bind(("table", Self::table_name()))
|
||||
.bind(("id", self.id.clone()))
|
||||
.bind(("failed", TaskState::Failed.as_str()))
|
||||
.bind(("processing", TaskState::Processing.as_str()))
|
||||
.bind(("now", SurrealDatetime::from(now)))
|
||||
.bind(("retry_at", SurrealDatetime::from(retry_at)))
|
||||
.bind(("error_code", error.code.clone()))
|
||||
.bind(("error_message", error.message.clone()))
|
||||
.bind(("worker_id", self.worker_id.clone().unwrap_or_default()))
|
||||
.await?;
|
||||
|
||||
let updated: Option<IngestionTask> = result.take(0)?;
|
||||
updated.ok_or_else(|| invalid_transition(&self.state, TaskTransition::Fail))
|
||||
}
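    // Illustrative sketch, not part of this diff: one way a caller might derive the
    // `retry_delay` passed to `mark_failed` above, using capped exponential backoff.
    // The 30s base and the 30 minute ceiling are assumptions, not repository defaults.
    #[allow(dead_code)]
    fn example_retry_delay(attempts: u32) -> Duration {
        let base = Duration::from_secs(30);
        // Cap the exponent so the multiplication cannot overflow for large attempt counts.
        let exp = attempts.min(6);
        let delay = base.saturating_mul(2u32.saturating_pow(exp));
        delay.min(Duration::from_secs(30 * 60))
    }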
|
||||
|
||||
/// Get all unfinished tasks, i.e. newly created and in-progress tasks that have not exhausted their attempts
|
||||
pub async fn get_unfinished_tasks(db: &SurrealDbClient) -> Result<Vec<Self>, AppError> {
|
||||
let jobs: Vec<Self> = db
|
||||
pub async fn mark_dead_letter(
|
||||
&self,
|
||||
error: TaskErrorInfo,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<IngestionTask, AppError> {
|
||||
const DEAD_LETTER_QUERY: &str = r#"
|
||||
UPDATE type::thing($table, $id)
|
||||
SET state = $dead,
|
||||
updated_at = $now,
|
||||
locked_at = NONE,
|
||||
worker_id = NONE,
|
||||
scheduled_at = $now,
|
||||
error_code = $error_code,
|
||||
error_message = $error_message,
|
||||
last_error_at = $now
|
||||
WHERE state = $failed
|
||||
RETURN *;
|
||||
"#;
|
||||
|
||||
let now = chrono::Utc::now();
|
||||
let mut result = db
|
||||
.client
|
||||
.query(DEAD_LETTER_QUERY)
|
||||
.bind(("table", Self::table_name()))
|
||||
.bind(("id", self.id.clone()))
|
||||
.bind(("dead", TaskState::DeadLetter.as_str()))
|
||||
.bind(("failed", TaskState::Failed.as_str()))
|
||||
.bind(("now", SurrealDatetime::from(now)))
|
||||
.bind(("error_code", error.code.clone()))
|
||||
.bind(("error_message", error.message.clone()))
|
||||
.await?;
|
||||
|
||||
let updated: Option<IngestionTask> = result.take(0)?;
|
||||
updated.ok_or_else(|| invalid_transition(&self.state, TaskTransition::DeadLetter))
|
||||
}
|
||||
|
||||
pub async fn mark_cancelled(&self, db: &SurrealDbClient) -> Result<IngestionTask, AppError> {
|
||||
const CANCEL_QUERY: &str = r#"
|
||||
UPDATE type::thing($table, $id)
|
||||
SET state = $cancelled,
|
||||
updated_at = $now,
|
||||
locked_at = NONE,
|
||||
worker_id = NONE
|
||||
WHERE state IN $allow_states
|
||||
RETURN *;
|
||||
"#;
|
||||
|
||||
let now = chrono::Utc::now();
|
||||
let mut result = db
|
||||
.client
|
||||
.query(CANCEL_QUERY)
|
||||
.bind(("table", Self::table_name()))
|
||||
.bind(("id", self.id.clone()))
|
||||
.bind(("cancelled", TaskState::Cancelled.as_str()))
|
||||
.bind((
|
||||
"allow_states",
|
||||
vec![
|
||||
TaskState::Pending.as_str(),
|
||||
TaskState::Reserved.as_str(),
|
||||
TaskState::Processing.as_str(),
|
||||
],
|
||||
))
|
||||
.bind(("now", SurrealDatetime::from(now)))
|
||||
.await?;
|
||||
|
||||
let updated: Option<IngestionTask> = result.take(0)?;
|
||||
updated.ok_or_else(|| invalid_transition(&self.state, TaskTransition::Cancel))
|
||||
}
|
||||
|
||||
pub async fn release(&self, db: &SurrealDbClient) -> Result<IngestionTask, AppError> {
|
||||
const RELEASE_QUERY: &str = r#"
|
||||
UPDATE type::thing($table, $id)
|
||||
SET state = $pending,
|
||||
updated_at = $now,
|
||||
locked_at = NONE,
|
||||
worker_id = NONE
|
||||
WHERE state = $reserved
|
||||
RETURN *;
|
||||
"#;
|
||||
|
||||
let now = chrono::Utc::now();
|
||||
let mut result = db
|
||||
.client
|
||||
.query(RELEASE_QUERY)
|
||||
.bind(("table", Self::table_name()))
|
||||
.bind(("id", self.id.clone()))
|
||||
.bind(("pending", TaskState::Pending.as_str()))
|
||||
.bind(("reserved", TaskState::Reserved.as_str()))
|
||||
.bind(("now", SurrealDatetime::from(now)))
|
||||
.await?;
|
||||
|
||||
let updated: Option<IngestionTask> = result.take(0)?;
|
||||
updated.ok_or_else(|| invalid_transition(&self.state, TaskTransition::Release))
|
||||
}
|
||||
|
||||
pub async fn get_unfinished_tasks(
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<Vec<IngestionTask>, AppError> {
|
||||
let tasks: Vec<IngestionTask> = db
|
||||
.query(
|
||||
"SELECT * FROM type::table($table)
|
||||
WHERE
|
||||
status.name = 'Created'
|
||||
OR (
|
||||
status.name = 'InProgress'
|
||||
AND status.attempts < $max_attempts
|
||||
)
|
||||
ORDER BY created_at ASC",
|
||||
"SELECT * FROM type::table($table)
|
||||
WHERE state IN $active_states
|
||||
ORDER BY scheduled_at ASC, created_at ASC",
|
||||
)
|
||||
.bind(("table", Self::table_name()))
|
||||
.bind(("max_attempts", MAX_ATTEMPTS))
|
||||
.bind((
|
||||
"active_states",
|
||||
vec![
|
||||
TaskState::Pending.as_str(),
|
||||
TaskState::Reserved.as_str(),
|
||||
TaskState::Processing.as_str(),
|
||||
TaskState::Failed.as_str(),
|
||||
],
|
||||
))
|
||||
.await?
|
||||
.take(0)?;
|
||||
|
||||
Ok(jobs)
|
||||
Ok(tasks)
|
||||
}
|
||||
}
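// Summary sketch, not part of this diff: the transitions that the guarded UPDATE
// queries above allow, written out as a plain predicate. Variant names mirror the
// TaskState and TaskTransition values used in this file; the helper itself is
// hypothetical, and the per-worker ownership checks are omitted.
#[allow(dead_code)]
fn example_is_valid_transition(state: &TaskState, transition: &TaskTransition) -> bool {
    matches!(
        (state, transition),
        (TaskState::Reserved, TaskTransition::StartProcessing)
            | (TaskState::Processing, TaskTransition::Succeed)
            | (TaskState::Processing, TaskTransition::Fail)
            | (TaskState::Failed, TaskTransition::DeadLetter)
            | (
                TaskState::Pending | TaskState::Reserved | TaskState::Processing,
                TaskTransition::Cancel,
            )
            | (TaskState::Reserved, TaskTransition::Release)
    )
}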
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use chrono::Utc;
|
||||
use crate::storage::types::ingestion_payload::IngestionPayload;
|
||||
|
||||
// Helper function to create a test ingestion payload
|
||||
fn create_test_payload(user_id: &str) -> IngestionPayload {
|
||||
fn create_payload(user_id: &str) -> IngestionPayload {
|
||||
IngestionPayload::Text {
|
||||
text: "Test content".to_string(),
|
||||
context: "Test context".to_string(),
|
||||
@@ -119,182 +541,197 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
async fn memory_db() -> SurrealDbClient {
|
||||
let namespace = "test_ns";
|
||||
let database = Uuid::new_v4().to_string();
|
||||
SurrealDbClient::memory(namespace, &database)
|
||||
.await
|
||||
.expect("in-memory surrealdb")
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_new_ingestion_task() {
|
||||
async fn test_new_task_defaults() {
|
||||
let user_id = "user123";
|
||||
let payload = create_test_payload(user_id);
|
||||
let payload = create_payload(user_id);
|
||||
let task = IngestionTask::new(payload.clone(), user_id.to_string());
|
||||
|
||||
let task = IngestionTask::new(payload.clone(), user_id.to_string()).await;
|
||||
|
||||
// Verify task properties
|
||||
assert_eq!(task.user_id, user_id);
|
||||
assert_eq!(task.content, payload);
|
||||
assert!(matches!(task.status, IngestionTaskStatus::Created));
|
||||
assert!(!task.id.is_empty());
|
||||
assert_eq!(task.state, TaskState::Pending);
|
||||
assert_eq!(task.attempts, 0);
|
||||
assert_eq!(task.max_attempts, MAX_ATTEMPTS);
|
||||
assert!(task.locked_at.is_none());
|
||||
assert!(task.worker_id.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_create_and_add_to_db() {
|
||||
// Setup in-memory database
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
async fn test_create_and_store_task() {
|
||||
let db = memory_db().await;
|
||||
let user_id = "user123";
|
||||
let payload = create_test_payload(user_id);
|
||||
let payload = create_payload(user_id);
|
||||
|
||||
// Create and store task
|
||||
IngestionTask::create_and_add_to_db(payload.clone(), user_id.to_string(), &db)
|
||||
let created =
|
||||
IngestionTask::create_and_add_to_db(payload.clone(), user_id.to_string(), &db)
|
||||
.await
|
||||
.expect("store");
|
||||
|
||||
let stored: Option<IngestionTask> = db
|
||||
.get_item::<IngestionTask>(&created.id)
|
||||
.await
|
||||
.expect("Failed to create and add task to db");
|
||||
.expect("fetch");
|
||||
|
||||
// Query to verify task was stored
|
||||
let query = format!(
|
||||
"SELECT * FROM {} WHERE user_id = '{}'",
|
||||
IngestionTask::table_name(),
|
||||
user_id
|
||||
);
|
||||
let mut result = db.query(query).await.expect("Query failed");
|
||||
let tasks: Vec<IngestionTask> = result.take(0).unwrap_or_default();
|
||||
|
||||
// Verify task is in the database
|
||||
assert!(!tasks.is_empty(), "Task should exist in the database");
|
||||
let stored_task = &tasks[0];
|
||||
assert_eq!(stored_task.user_id, user_id);
|
||||
assert!(matches!(stored_task.status, IngestionTaskStatus::Created));
|
||||
let stored = stored.expect("task exists");
|
||||
assert_eq!(stored.id, created.id);
|
||||
assert_eq!(stored.state, TaskState::Pending);
|
||||
assert_eq!(stored.attempts, 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_update_status() {
|
||||
// Setup in-memory database
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
async fn test_claim_and_transition() {
|
||||
let db = memory_db().await;
|
||||
let user_id = "user123";
|
||||
let payload = create_test_payload(user_id);
|
||||
let payload = create_payload(user_id);
|
||||
let task = IngestionTask::new(payload, user_id.to_string());
|
||||
db.store_item(task.clone()).await.expect("store");
|
||||
|
||||
// Create task manually
|
||||
let task = IngestionTask::new(payload.clone(), user_id.to_string()).await;
|
||||
let task_id = task.id.clone();
|
||||
let worker_id = "worker-1";
|
||||
let now = chrono::Utc::now();
|
||||
let claimed = IngestionTask::claim_next_ready(&db, worker_id, now, Duration::from_secs(60))
|
||||
.await
|
||||
.expect("claim");
|
||||
|
||||
// Store task
|
||||
db.store_item(task).await.expect("Failed to store task");
|
||||
let claimed = claimed.expect("task claimed");
|
||||
assert_eq!(claimed.state, TaskState::Reserved);
|
||||
assert_eq!(claimed.worker_id.as_deref(), Some(worker_id));
|
||||
|
||||
// Update status to InProgress
|
||||
let now = Utc::now();
|
||||
let new_status = IngestionTaskStatus::InProgress {
|
||||
attempts: 1,
|
||||
last_attempt: now,
|
||||
let processing = claimed.mark_processing(&db).await.expect("processing");
|
||||
assert_eq!(processing.state, TaskState::Processing);
|
||||
|
||||
let succeeded = processing.mark_succeeded(&db).await.expect("succeeded");
|
||||
assert_eq!(succeeded.state, TaskState::Succeeded);
|
||||
assert!(succeeded.worker_id.is_none());
|
||||
assert!(succeeded.locked_at.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fail_and_dead_letter() {
|
||||
let db = memory_db().await;
|
||||
let user_id = "user123";
|
||||
let payload = create_payload(user_id);
|
||||
let task = IngestionTask::new(payload, user_id.to_string());
|
||||
db.store_item(task.clone()).await.expect("store");
|
||||
|
||||
let worker_id = "worker-dead";
|
||||
let now = chrono::Utc::now();
|
||||
let claimed = IngestionTask::claim_next_ready(&db, worker_id, now, Duration::from_secs(60))
|
||||
.await
|
||||
.expect("claim")
|
||||
.expect("claimed");
|
||||
|
||||
let processing = claimed.mark_processing(&db).await.expect("processing");
|
||||
|
||||
let error_info = TaskErrorInfo {
|
||||
code: Some("pipeline_error".into()),
|
||||
message: "failed".into(),
|
||||
};
|
||||
|
||||
IngestionTask::update_status(&task_id, new_status.clone(), &db)
|
||||
let failed = processing
|
||||
.mark_failed(error_info.clone(), Duration::from_secs(30), &db)
|
||||
.await
|
||||
.expect("Failed to update status");
|
||||
.expect("failed update");
|
||||
assert_eq!(failed.state, TaskState::Failed);
|
||||
assert_eq!(failed.error_message.as_deref(), Some("failed"));
|
||||
assert!(failed.worker_id.is_none());
|
||||
assert!(failed.locked_at.is_none());
|
||||
assert!(failed.scheduled_at > now);
|
||||
|
||||
// Verify status updated
|
||||
let updated_task: Option<IngestionTask> = db
|
||||
.get_item::<IngestionTask>(&task_id)
|
||||
let dead = failed
|
||||
.mark_dead_letter(error_info.clone(), &db)
|
||||
.await
|
||||
.expect("Failed to get updated task");
|
||||
.expect("dead letter");
|
||||
assert_eq!(dead.state, TaskState::DeadLetter);
|
||||
assert_eq!(dead.error_message.as_deref(), Some("failed"));
|
||||
}
|
||||
|
||||
assert!(updated_task.is_some());
|
||||
let updated_task = updated_task.unwrap();
|
||||
#[tokio::test]
|
||||
async fn test_mark_processing_requires_reservation() {
|
||||
let db = memory_db().await;
|
||||
let user_id = "user123";
|
||||
let payload = create_payload(user_id);
|
||||
|
||||
match updated_task.status {
|
||||
IngestionTaskStatus::InProgress { attempts, .. } => {
|
||||
assert_eq!(attempts, 1);
|
||||
let task = IngestionTask::new(payload.clone(), user_id.to_string());
|
||||
db.store_item(task.clone()).await.expect("store");
|
||||
|
||||
let err = task
|
||||
.mark_processing(&db)
|
||||
.await
|
||||
.expect_err("processing should fail without reservation");
|
||||
|
||||
match err {
|
||||
AppError::Validation(message) => {
|
||||
assert!(
|
||||
message.contains("Pending -> start_processing"),
|
||||
"unexpected message: {message}"
|
||||
);
|
||||
}
|
||||
_ => panic!("Expected InProgress status"),
|
||||
other => panic!("expected validation error, got {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get_unfinished_tasks() {
|
||||
// Setup in-memory database
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
async fn test_mark_failed_requires_processing() {
|
||||
let db = memory_db().await;
|
||||
let user_id = "user123";
|
||||
let payload = create_test_payload(user_id);
|
||||
let payload = create_payload(user_id);
|
||||
|
||||
// Create tasks with different statuses
|
||||
let created_task = IngestionTask::new(payload.clone(), user_id.to_string()).await;
|
||||
let task = IngestionTask::new(payload.clone(), user_id.to_string());
|
||||
db.store_item(task.clone()).await.expect("store");
|
||||
|
||||
let mut in_progress_task = IngestionTask::new(payload.clone(), user_id.to_string()).await;
|
||||
in_progress_task.status = IngestionTaskStatus::InProgress {
|
||||
attempts: 1,
|
||||
last_attempt: Utc::now(),
|
||||
};
|
||||
|
||||
let mut max_attempts_task = IngestionTask::new(payload.clone(), user_id.to_string()).await;
|
||||
max_attempts_task.status = IngestionTaskStatus::InProgress {
|
||||
attempts: MAX_ATTEMPTS,
|
||||
last_attempt: Utc::now(),
|
||||
};
|
||||
|
||||
let mut completed_task = IngestionTask::new(payload.clone(), user_id.to_string()).await;
|
||||
completed_task.status = IngestionTaskStatus::Completed;
|
||||
|
||||
let mut error_task = IngestionTask::new(payload.clone(), user_id.to_string()).await;
|
||||
error_task.status = IngestionTaskStatus::Error {
|
||||
message: "Test error".to_string(),
|
||||
};
|
||||
|
||||
// Store all tasks
|
||||
db.store_item(created_task)
|
||||
let err = task
|
||||
.mark_failed(
|
||||
TaskErrorInfo {
|
||||
code: None,
|
||||
message: "boom".into(),
|
||||
},
|
||||
Duration::from_secs(30),
|
||||
&db,
|
||||
)
|
||||
.await
|
||||
.expect("Failed to store created task");
|
||||
db.store_item(in_progress_task)
|
||||
.await
|
||||
.expect("Failed to store in-progress task");
|
||||
db.store_item(max_attempts_task)
|
||||
.await
|
||||
.expect("Failed to store max-attempts task");
|
||||
db.store_item(completed_task)
|
||||
.await
|
||||
.expect("Failed to store completed task");
|
||||
db.store_item(error_task)
|
||||
.await
|
||||
.expect("Failed to store error task");
|
||||
.expect_err("failing should require processing state");
|
||||
|
||||
// Get unfinished tasks
|
||||
let unfinished_tasks = IngestionTask::get_unfinished_tasks(&db)
|
||||
match err {
|
||||
AppError::Validation(message) => {
|
||||
assert!(
|
||||
message.contains("Pending -> fail"),
|
||||
"unexpected message: {message}"
|
||||
);
|
||||
}
|
||||
other => panic!("expected validation error, got {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_release_requires_reservation() {
|
||||
let db = memory_db().await;
|
||||
let user_id = "user123";
|
||||
let payload = create_payload(user_id);
|
||||
|
||||
let task = IngestionTask::new(payload.clone(), user_id.to_string());
|
||||
db.store_item(task.clone()).await.expect("store");
|
||||
|
||||
let err = task
|
||||
.release(&db)
|
||||
.await
|
||||
.expect("Failed to get unfinished tasks");
|
||||
.expect_err("release should require reserved state");
|
||||
|
||||
// Verify only Created and InProgress with attempts < MAX_ATTEMPTS are returned
|
||||
assert_eq!(unfinished_tasks.len(), 2);
|
||||
|
||||
let statuses: Vec<_> = unfinished_tasks
|
||||
.iter()
|
||||
.map(|task| match &task.status {
|
||||
IngestionTaskStatus::Created => "Created",
|
||||
IngestionTaskStatus::InProgress { attempts, .. } => {
|
||||
if *attempts < MAX_ATTEMPTS {
|
||||
"InProgress<MAX"
|
||||
} else {
|
||||
"InProgress>=MAX"
|
||||
}
|
||||
}
|
||||
IngestionTaskStatus::Completed => "Completed",
|
||||
IngestionTaskStatus::Error { .. } => "Error",
|
||||
IngestionTaskStatus::Cancelled => "Cancelled",
|
||||
})
|
||||
.collect();
|
||||
|
||||
assert!(statuses.contains(&"Created"));
|
||||
assert!(statuses.contains(&"InProgress<MAX"));
|
||||
assert!(!statuses.contains(&"InProgress>=MAX"));
|
||||
assert!(!statuses.contains(&"Completed"));
|
||||
assert!(!statuses.contains(&"Error"));
|
||||
assert!(!statuses.contains(&"Cancelled"));
|
||||
match err {
|
||||
AppError::Validation(message) => {
|
||||
assert!(
|
||||
message.contains("Pending -> release"),
|
||||
"unexpected message: {message}"
|
||||
);
|
||||
}
|
||||
other => panic!("expected validation error, got {other:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,19 @@
|
||||
#![allow(
|
||||
clippy::missing_docs_in_private_items,
|
||||
clippy::module_name_repetitions,
|
||||
clippy::match_same_arms,
|
||||
clippy::format_push_string,
|
||||
clippy::uninlined_format_args,
|
||||
clippy::explicit_iter_loop,
|
||||
clippy::items_after_statements,
|
||||
clippy::get_first,
|
||||
clippy::redundant_closure_for_method_calls
|
||||
)]
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::{
|
||||
error::AppError, storage::db::SurrealDbClient, stored_object,
|
||||
error::AppError, storage::db::SurrealDbClient,
|
||||
storage::types::knowledge_entity_embedding::KnowledgeEntityEmbedding, stored_object,
|
||||
utils::embedding::generate_embedding,
|
||||
};
|
||||
use async_openai::{config::OpenAIConfig, Client};
|
||||
@@ -40,16 +52,54 @@ impl From<String> for KnowledgeEntityType {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct KnowledgeEntitySearchResult {
|
||||
#[serde(deserialize_with = "deserialize_flexible_id")]
|
||||
pub id: String,
|
||||
#[serde(
|
||||
serialize_with = "serialize_datetime",
|
||||
deserialize_with = "deserialize_datetime",
|
||||
default
|
||||
)]
|
||||
pub created_at: DateTime<Utc>,
|
||||
#[serde(
|
||||
serialize_with = "serialize_datetime",
|
||||
deserialize_with = "deserialize_datetime",
|
||||
default
|
||||
)]
|
||||
pub updated_at: DateTime<Utc>,
|
||||
|
||||
pub source_id: String,
|
||||
pub name: String,
|
||||
pub description: String,
|
||||
pub entity_type: KnowledgeEntityType,
|
||||
#[serde(default)]
|
||||
pub metadata: Option<serde_json::Value>,
|
||||
pub user_id: String,
|
||||
|
||||
pub score: f32,
|
||||
#[serde(default)]
|
||||
pub highlighted_name: Option<String>,
|
||||
#[serde(default)]
|
||||
pub highlighted_description: Option<String>,
|
||||
}
|
||||
|
||||
stored_object!(KnowledgeEntity, "knowledge_entity", {
|
||||
source_id: String,
|
||||
name: String,
|
||||
description: String,
|
||||
entity_type: KnowledgeEntityType,
|
||||
metadata: Option<serde_json::Value>,
|
||||
embedding: Vec<f32>,
|
||||
user_id: String
|
||||
});
|
||||
|
||||
/// Vector search result including hydrated entity.
|
||||
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
|
||||
pub struct KnowledgeEntityVectorResult {
|
||||
pub entity: KnowledgeEntity,
|
||||
pub score: f32,
|
||||
}
|
||||
|
||||
impl KnowledgeEntity {
|
||||
pub fn new(
|
||||
source_id: String,
|
||||
@@ -57,7 +107,6 @@ impl KnowledgeEntity {
|
||||
description: String,
|
||||
entity_type: KnowledgeEntityType,
|
||||
metadata: Option<serde_json::Value>,
|
||||
embedding: Vec<f32>,
|
||||
user_id: String,
|
||||
) -> Self {
|
||||
let now = Utc::now();
|
||||
@@ -70,25 +119,159 @@ impl KnowledgeEntity {
|
||||
description,
|
||||
entity_type,
|
||||
metadata,
|
||||
embedding,
|
||||
user_id,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn search(
|
||||
db: &SurrealDbClient,
|
||||
search_terms: &str,
|
||||
user_id: &str,
|
||||
limit: usize,
|
||||
) -> Result<Vec<KnowledgeEntitySearchResult>, AppError> {
|
||||
let sql = r#"
|
||||
SELECT
|
||||
id,
|
||||
created_at,
|
||||
updated_at,
|
||||
source_id,
|
||||
name,
|
||||
description,
|
||||
entity_type,
|
||||
metadata,
|
||||
user_id,
|
||||
search::highlight('<b>', '</b>', 0) AS highlighted_name,
|
||||
search::highlight('<b>', '</b>', 1) AS highlighted_description,
|
||||
(
|
||||
IF search::score(0) != NONE THEN search::score(0) ELSE 0 END +
|
||||
IF search::score(1) != NONE THEN search::score(1) ELSE 0 END
|
||||
) AS score
|
||||
FROM knowledge_entity
|
||||
WHERE
|
||||
(
|
||||
name @0@ $terms OR
|
||||
description @1@ $terms
|
||||
)
|
||||
AND user_id = $user_id
|
||||
ORDER BY score DESC
|
||||
LIMIT $limit;
|
||||
"#;
|
||||
|
||||
Ok(db
|
||||
.client
|
||||
.query(sql)
|
||||
.bind(("terms", search_terms.to_owned()))
|
||||
.bind(("user_id", user_id.to_owned()))
|
||||
.bind(("limit", limit))
|
||||
.await?
|
||||
.take(0)?)
|
||||
}
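    // Illustrative usage sketch, not part of this diff: calling the full-text `search`
    // above from application code. The query string, user id and limit are assumptions.
    #[allow(dead_code)]
    async fn example_text_search(db: &SurrealDbClient) -> Result<(), AppError> {
        let hits = Self::search(db, "surrealdb ingestion", "user123", 10).await?;
        for hit in hits {
            // highlighted_* fields carry <b>..</b>-wrapped snippets when the index matched.
            println!(
                "{:.3} {}: {}",
                hit.score,
                hit.name,
                hit.highlighted_description.as_deref().unwrap_or(&hit.description)
            );
        }
        Ok(())
    }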
|
||||
|
||||
pub async fn delete_by_source_id(
|
||||
source_id: &str,
|
||||
db_client: &SurrealDbClient,
|
||||
) -> Result<(), AppError> {
|
||||
let query = format!(
|
||||
"DELETE {} WHERE source_id = '{}'",
|
||||
Self::table_name(),
|
||||
source_id
|
||||
);
|
||||
db_client.query(query).await?;
|
||||
// Delete embeddings first, while we can still look them up via the entity's source_id
|
||||
KnowledgeEntityEmbedding::delete_by_source_id(source_id, db_client).await?;
|
||||
|
||||
db_client
|
||||
.client
|
||||
.query("DELETE FROM type::table($table) WHERE source_id = $source_id")
|
||||
.bind(("table", Self::table_name()))
|
||||
.bind(("source_id", source_id.to_owned()))
|
||||
.await
|
||||
.map_err(AppError::Database)?
|
||||
.check()
|
||||
.map_err(AppError::Database)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Atomically store a knowledge entity and its embedding.
|
||||
/// Writes the entity to `knowledge_entity` and the embedding to `knowledge_entity_embedding`.
|
||||
pub async fn store_with_embedding(
|
||||
entity: KnowledgeEntity,
|
||||
embedding: Vec<f32>,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<(), AppError> {
|
||||
let emb = KnowledgeEntityEmbedding::new(&entity.id, embedding, entity.user_id.clone());
|
||||
|
||||
let query = format!(
|
||||
"
|
||||
BEGIN TRANSACTION;
|
||||
CREATE type::thing('{entity_table}', $entity_id) CONTENT $entity;
|
||||
CREATE type::thing('{emb_table}', $emb_id) CONTENT $emb;
|
||||
COMMIT TRANSACTION;
|
||||
",
|
||||
entity_table = Self::table_name(),
|
||||
emb_table = KnowledgeEntityEmbedding::table_name(),
|
||||
);
|
||||
|
||||
db.client
|
||||
.query(query)
|
||||
.bind(("entity_id", entity.id.clone()))
|
||||
.bind(("entity", entity))
|
||||
.bind(("emb_id", emb.id.clone()))
|
||||
.bind(("emb", emb))
|
||||
.await
|
||||
.map_err(AppError::Database)?
|
||||
.check()
|
||||
.map_err(AppError::Database)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Vector search over knowledge entities using the embedding table, fetching full entity rows and scores.
|
||||
pub async fn vector_search(
|
||||
take: usize,
|
||||
query_embedding: Vec<f32>,
|
||||
db: &SurrealDbClient,
|
||||
user_id: &str,
|
||||
) -> Result<Vec<KnowledgeEntityVectorResult>, AppError> {
|
||||
#[derive(Deserialize)]
|
||||
struct Row {
|
||||
entity_id: Option<KnowledgeEntity>,
|
||||
score: f32,
|
||||
}
|
||||
|
||||
let sql = format!(
|
||||
r#"
|
||||
SELECT
|
||||
entity_id,
|
||||
vector::similarity::cosine(embedding, $embedding) AS score
|
||||
FROM {emb_table}
|
||||
WHERE user_id = $user_id
|
||||
AND embedding <|{take},100|> $embedding
|
||||
ORDER BY score DESC
|
||||
LIMIT {take}
|
||||
FETCH entity_id;
|
||||
"#,
|
||||
emb_table = KnowledgeEntityEmbedding::table_name(),
|
||||
take = take
|
||||
);
|
||||
|
||||
let mut response = db
|
||||
.query(&sql)
|
||||
.bind(("embedding", query_embedding))
|
||||
.bind(("user_id", user_id.to_string()))
|
||||
.await
|
||||
.map_err(|e| AppError::InternalError(format!("Surreal query failed: {e}")))?;
|
||||
|
||||
response = response.check().map_err(AppError::Database)?;
|
||||
|
||||
let rows: Vec<Row> = response.take::<Vec<Row>>(0).map_err(AppError::Database)?;
|
||||
|
||||
Ok(rows
|
||||
.into_iter()
|
||||
.filter_map(|r| {
|
||||
r.entity_id.map(|entity| KnowledgeEntityVectorResult {
|
||||
entity,
|
||||
score: r.score,
|
||||
})
|
||||
})
|
||||
.collect())
|
||||
}
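    // Illustrative sketch, not part of this diff: embedding a query string and running
    // the `vector_search` above. The provider wiring and the top-k value are assumptions.
    #[allow(dead_code)]
    async fn example_semantic_lookup(
        db: &SurrealDbClient,
        provider: &crate::utils::embedding::EmbeddingProvider,
        user_id: &str,
        query: &str,
    ) -> Result<Vec<KnowledgeEntityVectorResult>, AppError> {
        let query_embedding = provider
            .embed(query)
            .await
            .map_err(|e| AppError::InternalError(format!("Embedding failed: {e}")))?;
        // Results come back ordered by cosine similarity, FETCHed into full entities.
        Self::vector_search(10, query_embedding, db, user_id).await
    }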
|
||||
|
||||
pub async fn patch(
|
||||
id: &str,
|
||||
name: &str,
|
||||
@@ -102,32 +285,55 @@ impl KnowledgeEntity {
|
||||
name, description, entity_type
|
||||
);
|
||||
let embedding = generate_embedding(ai_client, &embedding_input, db_client).await?;
|
||||
let user_id = Self::get_user_id_by_id(id, db_client).await?;
|
||||
let emb = KnowledgeEntityEmbedding::new(id, embedding, user_id);
|
||||
|
||||
let now = Utc::now();
|
||||
|
||||
db_client
|
||||
.client
|
||||
.query(
|
||||
"UPDATE type::thing($table, $id)
|
||||
SET name = $name,
|
||||
description = $description,
|
||||
updated_at = $updated_at,
|
||||
entity_type = $entity_type,
|
||||
embedding = $embedding
|
||||
RETURN AFTER",
|
||||
"BEGIN TRANSACTION;
|
||||
UPDATE type::thing($table, $id)
|
||||
SET name = $name,
|
||||
description = $description,
|
||||
updated_at = $updated_at,
|
||||
entity_type = $entity_type;
|
||||
UPSERT type::thing($emb_table, $emb_id) CONTENT $emb;
|
||||
COMMIT TRANSACTION;",
|
||||
)
|
||||
.bind(("table", Self::table_name()))
|
||||
.bind(("emb_table", KnowledgeEntityEmbedding::table_name()))
|
||||
.bind(("id", id.to_string()))
|
||||
.bind(("name", name.to_string()))
|
||||
.bind(("updated_at", surrealdb::Datetime::from(now)))
|
||||
.bind(("entity_type", entity_type.to_owned()))
|
||||
.bind(("embedding", embedding))
|
||||
.bind(("emb_id", emb.id.clone()))
|
||||
.bind(("emb", emb))
|
||||
.bind(("description", description.to_string()))
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_user_id_by_id(id: &str, db_client: &SurrealDbClient) -> Result<String, AppError> {
|
||||
let mut response = db_client
|
||||
.client
|
||||
.query("SELECT user_id FROM type::thing($table, $id) LIMIT 1")
|
||||
.bind(("table", Self::table_name()))
|
||||
.bind(("id", id.to_string()))
|
||||
.await
|
||||
.map_err(AppError::Database)?;
|
||||
#[derive(Deserialize)]
|
||||
struct Row {
|
||||
user_id: String,
|
||||
}
|
||||
let rows: Vec<Row> = response.take(0).map_err(AppError::Database)?;
|
||||
rows.get(0)
|
||||
.map(|r| r.user_id.clone())
|
||||
.ok_or_else(|| AppError::InternalError("user not found for entity".to_string()))
|
||||
}
|
||||
|
||||
/// Re-creates embeddings for all knowledge entities in the database.
|
||||
///
|
||||
/// This is a costly operation that should be run in the background. It follows the same
|
||||
@@ -150,13 +356,15 @@ impl KnowledgeEntity {
|
||||
let all_entities: Vec<KnowledgeEntity> = db.select(Self::table_name()).await?;
|
||||
let total_entities = all_entities.len();
|
||||
if total_entities == 0 {
|
||||
info!("No knowledge entities to update. Skipping.");
|
||||
info!("No knowledge entities to update. Just updating the idx");
|
||||
|
||||
KnowledgeEntityEmbedding::redefine_hnsw_index(db, new_dimensions as usize).await?;
|
||||
return Ok(());
|
||||
}
|
||||
info!("Found {} entities to process.", total_entities);
|
||||
|
||||
// Generate all new embeddings in memory
|
||||
let mut new_embeddings: HashMap<String, Vec<f32>> = HashMap::new();
|
||||
let mut new_embeddings: HashMap<String, (Vec<f32>, String)> = HashMap::new();
|
||||
info!("Generating new embeddings for all entities...");
|
||||
for entity in all_entities.iter() {
|
||||
let embedding_input = format!(
|
||||
@@ -184,17 +392,16 @@ impl KnowledgeEntity {
|
||||
error!("{}", err_msg);
|
||||
return Err(AppError::InternalError(err_msg));
|
||||
}
|
||||
new_embeddings.insert(entity.id.clone(), embedding);
|
||||
new_embeddings.insert(entity.id.clone(), (embedding, entity.user_id.clone()));
|
||||
}
|
||||
info!("Successfully generated all new embeddings.");
|
||||
|
||||
// Perform DB updates in a single transaction
|
||||
info!("Applying schema and data changes in a transaction...");
|
||||
info!("Applying embedding updates in a transaction...");
|
||||
let mut transaction_query = String::from("BEGIN TRANSACTION;");
|
||||
|
||||
// Add all update statements
|
||||
for (id, embedding) in new_embeddings {
|
||||
// We must properly serialize the vector for the SurrealQL query string
|
||||
// Add all update statements to the embedding table
|
||||
for (id, (embedding, user_id)) in new_embeddings {
|
||||
let embedding_str = format!(
|
||||
"[{}]",
|
||||
embedding
|
||||
@@ -204,18 +411,22 @@ impl KnowledgeEntity {
|
||||
.join(",")
|
||||
);
|
||||
transaction_query.push_str(&format!(
|
||||
"UPDATE type::thing('knowledge_entity', '{}') SET embedding = {}, updated_at = time::now();",
|
||||
id, embedding_str
|
||||
));
|
||||
"UPSERT type::thing('knowledge_entity_embedding', '{id}') SET \
|
||||
entity_id = type::thing('knowledge_entity', '{id}'), \
|
||||
embedding = {embedding}, \
|
||||
user_id = '{user_id}', \
|
||||
created_at = IF created_at != NONE THEN created_at ELSE time::now() END, \
|
||||
updated_at = time::now();",
|
||||
id = id,
|
||||
embedding = embedding_str,
|
||||
user_id = user_id
|
||||
));
|
||||
}
|
||||
|
||||
// Re-create the index after updating the data that it will index
|
||||
transaction_query
|
||||
.push_str("REMOVE INDEX idx_embedding_entities ON TABLE knowledge_entity;");
|
||||
transaction_query.push_str(&format!(
|
||||
"DEFINE INDEX idx_embedding_entities ON TABLE knowledge_entity FIELDS embedding HNSW DIMENSION {};",
|
||||
new_dimensions
|
||||
));
|
||||
"DEFINE INDEX OVERWRITE idx_embedding_knowledge_entity_embedding ON TABLE knowledge_entity_embedding FIELDS embedding HNSW DIMENSION {};",
|
||||
new_dimensions
|
||||
));
|
||||
|
||||
transaction_query.push_str("COMMIT TRANSACTION;");
|
||||
|
||||
@@ -225,12 +436,146 @@ impl KnowledgeEntity {
|
||||
info!("Re-embedding process for knowledge entities completed successfully.");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Re-creates embeddings for all knowledge entities using an `EmbeddingProvider`.
|
||||
///
|
||||
/// This variant uses the application's configured embedding provider (FastEmbed, OpenAI, etc.)
|
||||
/// instead of directly calling OpenAI. Used during startup when embedding configuration changes.
|
||||
pub async fn update_all_embeddings_with_provider(
|
||||
db: &SurrealDbClient,
|
||||
provider: &crate::utils::embedding::EmbeddingProvider,
|
||||
) -> Result<(), AppError> {
|
||||
let new_dimensions = provider.dimension();
|
||||
info!(
|
||||
dimensions = new_dimensions,
|
||||
backend = provider.backend_label(),
|
||||
"Starting re-embedding process for all knowledge entities"
|
||||
);
|
||||
|
||||
// Fetch all entities first
|
||||
let all_entities: Vec<KnowledgeEntity> = db.select(Self::table_name()).await?;
|
||||
let total_entities = all_entities.len();
|
||||
if total_entities == 0 {
|
||||
info!("No knowledge entities to update. Just updating the index.");
|
||||
KnowledgeEntityEmbedding::redefine_hnsw_index(db, new_dimensions).await?;
|
||||
return Ok(());
|
||||
}
|
||||
info!(entities = total_entities, "Found entities to process");
|
||||
|
||||
// Generate all new embeddings in memory
|
||||
let mut new_embeddings: HashMap<String, (Vec<f32>, String)> = HashMap::new();
|
||||
info!("Generating new embeddings for all entities...");
|
||||
|
||||
for (i, entity) in all_entities.iter().enumerate() {
|
||||
if i > 0 && i % 100 == 0 {
|
||||
info!(
|
||||
progress = i,
|
||||
total = total_entities,
|
||||
"Re-embedding progress"
|
||||
);
|
||||
}
|
||||
|
||||
let embedding_input = format!(
|
||||
"name: {}, description: {}, type: {:?}",
|
||||
entity.name, entity.description, entity.entity_type
|
||||
);
|
||||
|
||||
let embedding = provider
|
||||
.embed(&embedding_input)
|
||||
.await
|
||||
.map_err(|e| AppError::InternalError(format!("Embedding failed: {e}")))?;
|
||||
|
||||
// Safety check: ensure the generated embedding has the correct dimension.
|
||||
if embedding.len() != new_dimensions {
|
||||
let err_msg = format!(
|
||||
"CRITICAL: Generated embedding for entity {} has incorrect dimension ({}). Expected {}. Aborting.",
|
||||
entity.id, embedding.len(), new_dimensions
|
||||
);
|
||||
error!("{}", err_msg);
|
||||
return Err(AppError::InternalError(err_msg));
|
||||
}
|
||||
new_embeddings.insert(entity.id.clone(), (embedding, entity.user_id.clone()));
|
||||
}
|
||||
info!("Successfully generated all new embeddings.");
|
||||
info!("Successfully generated all new embeddings.");
|
||||
|
||||
// Clear existing embeddings and index first to prevent SurrealDB panics and dimension conflicts.
|
||||
info!("Removing old index and clearing embeddings...");
|
||||
|
||||
// Explicitly remove the index first. This prevents background HNSW maintenance from crashing
|
||||
// when we delete/replace data, working around a known SurrealDB panic.
|
||||
db.client
|
||||
.query(format!(
|
||||
"REMOVE INDEX idx_embedding_knowledge_entity_embedding ON TABLE {};",
|
||||
KnowledgeEntityEmbedding::table_name()
|
||||
))
|
||||
.await
|
||||
.map_err(AppError::Database)?
|
||||
.check()
|
||||
.map_err(AppError::Database)?;
|
||||
|
||||
db.client
|
||||
.query(format!(
|
||||
"DELETE FROM {};",
|
||||
KnowledgeEntityEmbedding::table_name()
|
||||
))
|
||||
.await
|
||||
.map_err(AppError::Database)?
|
||||
.check()
|
||||
.map_err(AppError::Database)?;
|
||||
|
||||
// Perform DB updates in a single transaction
|
||||
info!("Applying embedding updates in a transaction...");
|
||||
let mut transaction_query = String::from("BEGIN TRANSACTION;");
|
||||
|
||||
for (id, (embedding, user_id)) in new_embeddings {
|
||||
let embedding_str = format!(
|
||||
"[{}]",
|
||||
embedding
|
||||
.iter()
|
||||
.map(|f| f.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
);
|
||||
transaction_query.push_str(&format!(
|
||||
"CREATE type::thing('knowledge_entity_embedding', '{id}') SET \
|
||||
entity_id = type::thing('knowledge_entity', '{id}'), \
|
||||
embedding = {embedding}, \
|
||||
user_id = '{user_id}', \
|
||||
created_at = time::now(), \
|
||||
updated_at = time::now();",
|
||||
id = id,
|
||||
embedding = embedding_str,
|
||||
user_id = user_id
|
||||
));
|
||||
}
|
||||
|
||||
transaction_query.push_str(&format!(
|
||||
"DEFINE INDEX OVERWRITE idx_embedding_knowledge_entity_embedding ON TABLE knowledge_entity_embedding FIELDS embedding HNSW DIMENSION {};",
|
||||
new_dimensions
|
||||
));
|
||||
|
||||
transaction_query.push_str("COMMIT TRANSACTION;");
|
||||
|
||||
// Execute the entire atomic operation
|
||||
db.client
|
||||
.query(transaction_query)
|
||||
.await
|
||||
.map_err(AppError::Database)?
|
||||
.check()
|
||||
.map_err(AppError::Database)?;
|
||||
|
||||
info!("Re-embedding process for knowledge entities completed successfully.");
|
||||
Ok(())
|
||||
}
|
||||
}
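// Illustrative sketch, not part of this diff: how a startup path might decide to run
// the provider-based re-embedding above. The stored-dimension lookup is hypothetical;
// only `provider.dimension()` and `update_all_embeddings_with_provider` come from this file.
#[allow(dead_code)]
async fn example_reembed_if_dimension_changed(
    db: &SurrealDbClient,
    provider: &crate::utils::embedding::EmbeddingProvider,
    previously_configured_dimension: usize,
) -> Result<(), AppError> {
    if provider.dimension() != previously_configured_dimension {
        // Dimension changed: regenerate every embedding and rebuild the HNSW index.
        KnowledgeEntity::update_all_embeddings_with_provider(db, provider).await?;
    }
    Ok(())
}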
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::storage::types::knowledge_entity_embedding::KnowledgeEntityEmbedding;
|
||||
use serde_json::json;
|
||||
use uuid::Uuid;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_knowledge_entity_creation() {
|
||||
@@ -240,7 +585,6 @@ mod tests {
|
||||
let description = "Test Description".to_string();
|
||||
let entity_type = KnowledgeEntityType::Document;
|
||||
let metadata = Some(json!({"key": "value"}));
|
||||
let embedding = vec![0.1, 0.2, 0.3, 0.4, 0.5];
|
||||
let user_id = "user123".to_string();
|
||||
|
||||
let entity = KnowledgeEntity::new(
|
||||
@@ -249,7 +593,6 @@ mod tests {
|
||||
description.clone(),
|
||||
entity_type.clone(),
|
||||
metadata.clone(),
|
||||
embedding.clone(),
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
@@ -259,7 +602,6 @@ mod tests {
|
||||
assert_eq!(entity.description, description);
|
||||
assert_eq!(entity.entity_type, entity_type);
|
||||
assert_eq!(entity.metadata, metadata);
|
||||
assert_eq!(entity.embedding, embedding);
|
||||
assert_eq!(entity.user_id, user_id);
|
||||
assert!(!entity.id.is_empty());
|
||||
}
|
||||
@@ -323,20 +665,25 @@ mod tests {
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
// Create two entities with the same source_id
|
||||
let source_id = "source123".to_string();
|
||||
let entity_type = KnowledgeEntityType::Document;
|
||||
let embedding = vec![0.1, 0.2, 0.3, 0.4, 0.5];
|
||||
let user_id = "user123".to_string();
|
||||
|
||||
KnowledgeEntityEmbedding::redefine_hnsw_index(&db, 5)
|
||||
.await
|
||||
.expect("Failed to redefine index length");
|
||||
|
||||
let entity1 = KnowledgeEntity::new(
|
||||
source_id.clone(),
|
||||
"Entity 1".to_string(),
|
||||
"Description 1".to_string(),
|
||||
entity_type.clone(),
|
||||
None,
|
||||
embedding.clone(),
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
@@ -346,7 +693,6 @@ mod tests {
|
||||
"Description 2".to_string(),
|
||||
entity_type.clone(),
|
||||
None,
|
||||
embedding.clone(),
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
@@ -358,18 +704,18 @@ mod tests {
|
||||
"Different Description".to_string(),
|
||||
entity_type.clone(),
|
||||
None,
|
||||
embedding.clone(),
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
let emb = vec![0.1, 0.2, 0.3, 0.4, 0.5];
|
||||
// Store the entities
|
||||
db.store_item(entity1)
|
||||
KnowledgeEntity::store_with_embedding(entity1.clone(), emb.clone(), &db)
|
||||
.await
|
||||
.expect("Failed to store entity 1");
|
||||
db.store_item(entity2)
|
||||
KnowledgeEntity::store_with_embedding(entity2.clone(), emb.clone(), &db)
|
||||
.await
|
||||
.expect("Failed to store entity 2");
|
||||
db.store_item(different_entity.clone())
|
||||
KnowledgeEntity::store_with_embedding(different_entity.clone(), emb.clone(), &db)
|
||||
.await
|
||||
.expect("Failed to store different entity");
|
||||
|
||||
@@ -418,6 +764,271 @@ mod tests {
|
||||
assert_eq!(different_remaining[0].id, different_entity.id);
|
||||
}
|
||||
|
||||
// Note: We can't easily test the patch method without mocking the OpenAI client
|
||||
// and the generate_embedding function. This would require more complex setup.
|
||||
#[tokio::test]
|
||||
async fn test_delete_by_source_id_resists_query_injection() {
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
KnowledgeEntityEmbedding::redefine_hnsw_index(&db, 3)
|
||||
.await
|
||||
.expect("Failed to redefine index length");
|
||||
|
||||
let user_id = "user123".to_string();
|
||||
|
||||
let entity1 = KnowledgeEntity::new(
|
||||
"safe_source".to_string(),
|
||||
"Entity 1".to_string(),
|
||||
"Description 1".to_string(),
|
||||
KnowledgeEntityType::Document,
|
||||
None,
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
let entity2 = KnowledgeEntity::new(
|
||||
"other_source".to_string(),
|
||||
"Entity 2".to_string(),
|
||||
"Description 2".to_string(),
|
||||
KnowledgeEntityType::Document,
|
||||
None,
|
||||
user_id,
|
||||
);
|
||||
|
||||
KnowledgeEntity::store_with_embedding(entity1, vec![0.1, 0.2, 0.3], &db)
|
||||
.await
|
||||
.expect("store entity1");
|
||||
KnowledgeEntity::store_with_embedding(entity2, vec![0.3, 0.2, 0.1], &db)
|
||||
.await
|
||||
.expect("store entity2");
|
||||
|
||||
let malicious_source = "safe_source' OR 1=1 --";
|
||||
KnowledgeEntity::delete_by_source_id(malicious_source, &db)
|
||||
.await
|
||||
.expect("delete call should succeed");
|
||||
|
||||
let remaining: Vec<KnowledgeEntity> = db
|
||||
.client
|
||||
.query("SELECT * FROM type::table($table)")
|
||||
.bind(("table", KnowledgeEntity::table_name()))
|
||||
.await
|
||||
.expect("query failed")
|
||||
.take(0)
|
||||
.expect("take failed");
|
||||
|
||||
assert_eq!(
|
||||
remaining.len(),
|
||||
2,
|
||||
"malicious input must not delete unrelated entities"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_vector_search_returns_empty_when_no_embeddings() {
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
KnowledgeEntityEmbedding::redefine_hnsw_index(&db, 3)
|
||||
.await
|
||||
.expect("Failed to redefine index length");
|
||||
|
||||
let results = KnowledgeEntity::vector_search(5, vec![0.1, 0.2, 0.3], &db, "user")
|
||||
.await
|
||||
.expect("vector search");
|
||||
assert!(results.is_empty());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_vector_search_single_result() {
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
KnowledgeEntityEmbedding::redefine_hnsw_index(&db, 3)
|
||||
.await
|
||||
.expect("Failed to redefine index length");
|
||||
|
||||
let user_id = "user".to_string();
|
||||
let source_id = "src".to_string();
|
||||
let entity = KnowledgeEntity::new(
|
||||
source_id.clone(),
|
||||
"hello".to_string(),
|
||||
"world".to_string(),
|
||||
KnowledgeEntityType::Document,
|
||||
None,
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
KnowledgeEntity::store_with_embedding(entity.clone(), vec![0.1, 0.2, 0.3], &db)
|
||||
.await
|
||||
.expect("store entity with embedding");
|
||||
|
||||
let stored_entity: Option<KnowledgeEntity> = db.get_item(&entity.id).await.unwrap();
|
||||
assert!(stored_entity.is_some());
|
||||
|
||||
let stored_embeddings: Vec<KnowledgeEntityEmbedding> = db
|
||||
.client
|
||||
.query(format!(
|
||||
"SELECT * FROM {}",
|
||||
KnowledgeEntityEmbedding::table_name()
|
||||
))
|
||||
.await
|
||||
.expect("query embeddings")
|
||||
.take(0)
|
||||
.expect("take embeddings");
|
||||
assert_eq!(stored_embeddings.len(), 1);
|
||||
|
||||
let rid = surrealdb::RecordId::from_table_key(KnowledgeEntity::table_name(), &entity.id);
|
||||
let fetched_emb = KnowledgeEntityEmbedding::get_by_entity_id(&rid, &db)
|
||||
.await
|
||||
.expect("fetch embedding");
|
||||
assert!(fetched_emb.is_some());
|
||||
|
||||
let results = KnowledgeEntity::vector_search(3, vec![0.1, 0.2, 0.3], &db, &user_id)
|
||||
.await
|
||||
.expect("vector search");
|
||||
|
||||
assert_eq!(results.len(), 1);
|
||||
let res = &results[0];
|
||||
assert_eq!(res.entity.id, entity.id);
|
||||
assert_eq!(res.entity.source_id, source_id);
|
||||
assert_eq!(res.entity.name, "hello");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_vector_search_orders_by_similarity() {
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
KnowledgeEntityEmbedding::redefine_hnsw_index(&db, 3)
|
||||
.await
|
||||
.expect("Failed to redefine index length");
|
||||
|
||||
let user_id = "user".to_string();
|
||||
let e1 = KnowledgeEntity::new(
|
||||
"s1".to_string(),
|
||||
"entity one".to_string(),
|
||||
"desc".to_string(),
|
||||
KnowledgeEntityType::Document,
|
||||
None,
|
||||
user_id.clone(),
|
||||
);
|
||||
let e2 = KnowledgeEntity::new(
|
||||
"s2".to_string(),
|
||||
"entity two".to_string(),
|
||||
"desc".to_string(),
|
||||
KnowledgeEntityType::Document,
|
||||
None,
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
KnowledgeEntity::store_with_embedding(e1.clone(), vec![1.0, 0.0, 0.0], &db)
|
||||
.await
|
||||
.expect("store e1");
|
||||
KnowledgeEntity::store_with_embedding(e2.clone(), vec![0.0, 1.0, 0.0], &db)
|
||||
.await
|
||||
.expect("store e2");
|
||||
|
||||
let stored_e1: Option<KnowledgeEntity> = db.get_item(&e1.id).await.unwrap();
|
||||
let stored_e2: Option<KnowledgeEntity> = db.get_item(&e2.id).await.unwrap();
|
||||
assert!(stored_e1.is_some() && stored_e2.is_some());
|
||||
|
||||
let stored_embeddings: Vec<KnowledgeEntityEmbedding> = db
|
||||
.client
|
||||
.query(format!(
|
||||
"SELECT * FROM {}",
|
||||
KnowledgeEntityEmbedding::table_name()
|
||||
))
|
||||
.await
|
||||
.expect("query embeddings")
|
||||
.take(0)
|
||||
.expect("take embeddings");
|
||||
assert_eq!(stored_embeddings.len(), 2);
|
||||
|
||||
let rid_e1 = surrealdb::RecordId::from_table_key(KnowledgeEntity::table_name(), &e1.id);
|
||||
let rid_e2 = surrealdb::RecordId::from_table_key(KnowledgeEntity::table_name(), &e2.id);
|
||||
assert!(KnowledgeEntityEmbedding::get_by_entity_id(&rid_e1, &db)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_some());
|
||||
assert!(KnowledgeEntityEmbedding::get_by_entity_id(&rid_e2, &db)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_some());
|
||||
|
||||
let results = KnowledgeEntity::vector_search(2, vec![0.0, 1.0, 0.0], &db, &user_id)
|
||||
.await
|
||||
.expect("vector search");
|
||||
|
||||
assert_eq!(results.len(), 2);
|
||||
assert_eq!(results[0].entity.id, e2.id);
|
||||
assert_eq!(results[1].entity.id, e1.id);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_vector_search_with_orphaned_embedding() {
|
||||
let namespace = "test_ns_orphan";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
KnowledgeEntityEmbedding::redefine_hnsw_index(&db, 3)
|
||||
.await
|
||||
.expect("Failed to redefine index length");
|
||||
|
||||
let user_id = "user".to_string();
|
||||
let source_id = "src".to_string();
|
||||
let entity = KnowledgeEntity::new(
|
||||
source_id.clone(),
|
||||
"orphan".to_string(),
|
||||
"orphan desc".to_string(),
|
||||
KnowledgeEntityType::Document,
|
||||
None,
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
KnowledgeEntity::store_with_embedding(entity.clone(), vec![0.1, 0.2, 0.3], &db)
|
||||
.await
|
||||
.expect("store entity with embedding");
|
||||
|
||||
// Manually delete the entity to create an orphan
|
||||
let query = format!("DELETE type::thing('knowledge_entity', '{}')", entity.id);
|
||||
db.client.query(query).await.expect("delete entity");
|
||||
|
||||
// Now search
|
||||
let results = KnowledgeEntity::vector_search(3, vec![0.1, 0.2, 0.3], &db, &user_id)
|
||||
.await
|
||||
.expect("search should succeed even with orphans");
|
||||
|
||||
assert!(
|
||||
results.is_empty(),
|
||||
"Should return empty result for orphan, got: {:?}",
|
||||
results
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
387
common/src/storage/types/knowledge_entity_embedding.rs
Normal file
@@ -0,0 +1,387 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use surrealdb::RecordId;
|
||||
|
||||
use crate::{error::AppError, storage::db::SurrealDbClient, stored_object};
|
||||
|
||||
stored_object!(KnowledgeEntityEmbedding, "knowledge_entity_embedding", {
|
||||
entity_id: RecordId,
|
||||
embedding: Vec<f32>,
|
||||
/// Denormalized user id for query scoping
|
||||
user_id: String
|
||||
});
|
||||
|
||||
impl KnowledgeEntityEmbedding {
|
||||
/// Recreate the HNSW index with a new embedding dimension.
|
||||
pub async fn redefine_hnsw_index(
|
||||
db: &SurrealDbClient,
|
||||
dimension: usize,
|
||||
) -> Result<(), AppError> {
|
||||
let query = format!(
|
||||
"BEGIN TRANSACTION;
|
||||
REMOVE INDEX IF EXISTS idx_embedding_knowledge_entity_embedding ON TABLE {table};
|
||||
DEFINE INDEX idx_embedding_knowledge_entity_embedding ON TABLE {table} FIELDS embedding HNSW DIMENSION {dimension};
|
||||
COMMIT TRANSACTION;",
|
||||
table = Self::table_name(),
|
||||
);
|
||||
|
||||
let res = db.client.query(query).await.map_err(AppError::Database)?;
|
||||
res.check().map_err(AppError::Database)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create a new knowledge entity embedding
|
||||
pub fn new(entity_id: &str, embedding: Vec<f32>, user_id: String) -> Self {
|
||||
let now = Utc::now();
|
||||
Self {
|
||||
id: uuid::Uuid::new_v4().to_string(),
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
entity_id: RecordId::from_table_key("knowledge_entity", entity_id),
|
||||
embedding,
|
||||
user_id,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get embedding by entity ID
|
||||
pub async fn get_by_entity_id(
|
||||
entity_id: &RecordId,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<Option<Self>, AppError> {
|
||||
let query = format!(
|
||||
"SELECT * FROM {} WHERE entity_id = $entity_id LIMIT 1",
|
||||
Self::table_name()
|
||||
);
|
||||
let mut result = db
|
||||
.client
|
||||
.query(query)
|
||||
.bind(("entity_id", entity_id.clone()))
|
||||
.await
|
||||
.map_err(AppError::Database)?;
|
||||
let embeddings: Vec<Self> = result.take(0).map_err(AppError::Database)?;
|
||||
Ok(embeddings.into_iter().next())
|
||||
}
|
||||
|
||||
/// Get embeddings for multiple entities in batch
|
||||
pub async fn get_by_entity_ids(
|
||||
entity_ids: &[RecordId],
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<HashMap<String, Vec<f32>>, AppError> {
|
||||
if entity_ids.is_empty() {
|
||||
return Ok(HashMap::new());
|
||||
}
|
||||
|
||||
let ids_list: Vec<RecordId> = entity_ids.to_vec();
|
||||
|
||||
let query = format!(
|
||||
"SELECT * FROM {} WHERE entity_id INSIDE $entity_ids",
|
||||
Self::table_name()
|
||||
);
|
||||
let mut result = db
|
||||
.client
|
||||
.query(query)
|
||||
.bind(("entity_ids", ids_list))
|
||||
.await
|
||||
.map_err(AppError::Database)?;
|
||||
let embeddings: Vec<Self> = result.take(0).map_err(AppError::Database)?;
|
||||
|
||||
Ok(embeddings
|
||||
.into_iter()
|
||||
.map(|e| (e.entity_id.key().to_string(), e.embedding))
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Delete embedding by entity ID
|
||||
pub async fn delete_by_entity_id(
|
||||
entity_id: &RecordId,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<(), AppError> {
|
||||
let query = format!(
|
||||
"DELETE FROM {} WHERE entity_id = $entity_id",
|
||||
Self::table_name()
|
||||
);
|
||||
db.client
|
||||
.query(query)
|
||||
.bind(("entity_id", entity_id.clone()))
|
||||
.await
|
||||
.map_err(AppError::Database)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Delete embeddings by source_id (via joining to knowledge_entity table)
|
||||
#[allow(clippy::items_after_statements)]
|
||||
pub async fn delete_by_source_id(
|
||||
source_id: &str,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<(), AppError> {
|
||||
let query = "SELECT id FROM knowledge_entity WHERE source_id = $source_id";
|
||||
let mut res = db
|
||||
.client
|
||||
.query(query)
|
||||
.bind(("source_id", source_id.to_owned()))
|
||||
.await
|
||||
.map_err(AppError::Database)?;
|
||||
#[allow(clippy::missing_docs_in_private_items)]
|
||||
#[derive(Deserialize)]
|
||||
struct IdRow {
|
||||
id: RecordId,
|
||||
}
|
||||
let ids: Vec<IdRow> = res.take(0).map_err(AppError::Database)?;
|
||||
|
||||
for row in ids {
|
||||
Self::delete_by_entity_id(&row.id, db).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
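// Illustrative sketch, not part of this diff: batch-loading embeddings for a set of
// entity record ids via `get_by_entity_ids` above. The id list is an assumption.
#[allow(dead_code)]
async fn example_batch_fetch(
    entity_ids: &[RecordId],
    db: &SurrealDbClient,
) -> Result<(), AppError> {
    let by_key = KnowledgeEntityEmbedding::get_by_entity_ids(entity_ids, db).await?;
    for (entity_key, embedding) in &by_key {
        // Keys are the entity record keys, as produced by `entity_id.key().to_string()`.
        println!("{entity_key}: {} dims", embedding.len());
    }
    Ok(())
}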
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::storage::db::SurrealDbClient;
|
||||
use crate::storage::types::knowledge_entity::{KnowledgeEntity, KnowledgeEntityType};
|
||||
use chrono::Utc;
|
||||
use surrealdb::Value as SurrealValue;
|
||||
use uuid::Uuid;
|
||||
|
||||
async fn setup_test_db() -> SurrealDbClient {
|
||||
let namespace = "test_ns";
|
||||
let database = Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, &database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
db
|
||||
}
|
||||
|
||||
fn build_knowledge_entity_with_id(
|
||||
key: &str,
|
||||
source_id: &str,
|
||||
user_id: &str,
|
||||
) -> KnowledgeEntity {
|
||||
KnowledgeEntity {
|
||||
id: key.to_owned(),
|
||||
created_at: Utc::now(),
|
||||
updated_at: Utc::now(),
|
||||
source_id: source_id.to_owned(),
|
||||
name: "Test entity".to_owned(),
|
||||
description: "Desc".to_owned(),
|
||||
entity_type: KnowledgeEntityType::Document,
|
||||
metadata: None,
|
||||
user_id: user_id.to_owned(),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_create_and_get_by_entity_id() {
|
||||
let db = setup_test_db().await;
|
||||
KnowledgeEntityEmbedding::redefine_hnsw_index(&db, 3)
|
||||
.await
|
||||
.expect("set test index dimension");
|
||||
let user_id = "user_ke";
|
||||
let entity_key = "entity-1";
|
||||
let source_id = "source-ke";
|
||||
|
||||
let embedding_vec = vec![0.11_f32, 0.22, 0.33];
|
||||
let entity = build_knowledge_entity_with_id(entity_key, source_id, user_id);
|
||||
|
||||
KnowledgeEntity::store_with_embedding(entity.clone(), embedding_vec.clone(), &db)
|
||||
.await
|
||||
.expect("Failed to store entity with embedding");
|
||||
|
||||
let entity_rid = RecordId::from_table_key(KnowledgeEntity::table_name(), &entity.id);
|
||||
|
||||
let fetched = KnowledgeEntityEmbedding::get_by_entity_id(&entity_rid, &db)
|
||||
.await
|
||||
.expect("Failed to get embedding by entity_id")
|
||||
.expect("Expected embedding to exist");
|
||||
|
||||
assert_eq!(fetched.user_id, user_id);
|
||||
assert_eq!(fetched.entity_id, entity_rid);
|
||||
assert_eq!(fetched.embedding, embedding_vec);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_delete_by_entity_id() {
|
||||
let db = setup_test_db().await;
|
||||
KnowledgeEntityEmbedding::redefine_hnsw_index(&db, 3)
|
||||
.await
|
||||
.expect("set test index dimension");
|
||||
let user_id = "user_ke";
|
||||
let entity_key = "entity-delete";
|
||||
let source_id = "source-del";
|
||||
|
||||
let entity = build_knowledge_entity_with_id(entity_key, source_id, user_id);
|
||||
|
||||
KnowledgeEntity::store_with_embedding(entity.clone(), vec![0.5_f32, 0.6, 0.7], &db)
|
||||
.await
|
||||
.expect("Failed to store entity with embedding");
|
||||
|
||||
let entity_rid = RecordId::from_table_key(KnowledgeEntity::table_name(), &entity.id);
|
||||
|
||||
let existing = KnowledgeEntityEmbedding::get_by_entity_id(&entity_rid, &db)
|
||||
.await
|
||||
.expect("Failed to get embedding before delete");
|
||||
assert!(existing.is_some());
|
||||
|
||||
KnowledgeEntityEmbedding::delete_by_entity_id(&entity_rid, &db)
|
||||
.await
|
||||
.expect("Failed to delete by entity_id");
|
||||
|
||||
let after = KnowledgeEntityEmbedding::get_by_entity_id(&entity_rid, &db)
|
||||
.await
|
||||
.expect("Failed to get embedding after delete");
|
||||
assert!(after.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_store_with_embedding_creates_entity_and_embedding() {
|
||||
let db = setup_test_db().await;
|
||||
let user_id = "user_store";
|
||||
let source_id = "source_store";
|
||||
let embedding = vec![0.2_f32, 0.3, 0.4];
|
||||
|
||||
KnowledgeEntityEmbedding::redefine_hnsw_index(&db, embedding.len())
|
||||
.await
|
||||
.expect("set test index dimension");
|
||||
|
||||
let entity = build_knowledge_entity_with_id("entity-store", source_id, user_id);
|
||||
|
||||
KnowledgeEntity::store_with_embedding(entity.clone(), embedding.clone(), &db)
|
||||
.await
|
||||
.expect("Failed to store entity with embedding");
|
||||
|
||||
let stored_entity: Option<KnowledgeEntity> = db.get_item(&entity.id).await.unwrap();
|
||||
assert!(stored_entity.is_some());
|
||||
|
||||
let entity_rid = RecordId::from_table_key(KnowledgeEntity::table_name(), &entity.id);
|
||||
let stored_embedding = KnowledgeEntityEmbedding::get_by_entity_id(&entity_rid, &db)
|
||||
.await
|
||||
.expect("Failed to fetch embedding");
|
||||
assert!(stored_embedding.is_some());
|
||||
let stored_embedding = stored_embedding.unwrap();
|
||||
assert_eq!(stored_embedding.user_id, user_id);
|
||||
assert_eq!(stored_embedding.entity_id, entity_rid);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_delete_by_source_id() {
|
||||
let db = setup_test_db().await;
|
||||
KnowledgeEntityEmbedding::redefine_hnsw_index(&db, 3)
|
||||
.await
|
||||
.expect("set test index dimension");
|
||||
let user_id = "user_ke";
|
||||
let source_id = "shared-ke";
|
||||
let other_source = "other-ke";
|
||||
|
||||
let entity1 = build_knowledge_entity_with_id("entity-s1", source_id, user_id);
|
||||
let entity2 = build_knowledge_entity_with_id("entity-s2", source_id, user_id);
|
||||
let entity_other = build_knowledge_entity_with_id("entity-other", other_source, user_id);
|
||||
|
||||
KnowledgeEntity::store_with_embedding(entity1.clone(), vec![1.0_f32, 1.1, 1.2], &db)
|
||||
.await
|
||||
.expect("Failed to store entity with embedding");
|
||||
KnowledgeEntity::store_with_embedding(entity2.clone(), vec![2.0_f32, 2.1, 2.2], &db)
|
||||
.await
|
||||
.expect("Failed to store entity with embedding");
|
||||
KnowledgeEntity::store_with_embedding(entity_other.clone(), vec![3.0_f32, 3.1, 3.2], &db)
|
||||
.await
|
||||
.expect("Failed to store entity with embedding");
|
||||
|
||||
let entity1_rid = RecordId::from_table_key(KnowledgeEntity::table_name(), &entity1.id);
|
||||
let entity2_rid = RecordId::from_table_key(KnowledgeEntity::table_name(), &entity2.id);
|
||||
let other_rid = RecordId::from_table_key(KnowledgeEntity::table_name(), &entity_other.id);
|
||||
|
||||
KnowledgeEntityEmbedding::delete_by_source_id(source_id, &db)
|
||||
.await
|
||||
.expect("Failed to delete by source_id");
|
||||
|
||||
assert!(
|
||||
KnowledgeEntityEmbedding::get_by_entity_id(&entity1_rid, &db)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_none()
|
||||
);
|
||||
assert!(
|
||||
KnowledgeEntityEmbedding::get_by_entity_id(&entity2_rid, &db)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_none()
|
||||
);
|
||||
assert!(KnowledgeEntityEmbedding::get_by_entity_id(&other_rid, &db)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_some());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_redefine_hnsw_index_updates_dimension() {
|
||||
let db = setup_test_db().await;
|
||||
|
||||
KnowledgeEntityEmbedding::redefine_hnsw_index(&db, 16)
|
||||
.await
|
||||
.expect("failed to redefine index");
|
||||
|
||||
let mut info_res = db
|
||||
.client
|
||||
.query("INFO FOR TABLE knowledge_entity_embedding;")
|
||||
.await
|
||||
.expect("info query failed");
|
||||
let info: SurrealValue = info_res.take(0).expect("failed to take info result");
|
||||
let info_json: serde_json::Value =
|
||||
serde_json::to_value(info).expect("failed to convert info to json");
|
||||
let idx_sql = info_json["Object"]["indexes"]["Object"]
|
||||
["idx_embedding_knowledge_entity_embedding"]["Strand"]
|
||||
.as_str()
|
||||
.unwrap_or_default();
|
||||
|
||||
assert!(
|
||||
idx_sql.contains("DIMENSION 16"),
|
||||
"expected index definition to contain new dimension, got: {idx_sql}"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fetch_entity_via_record_id() {
|
||||
let db = setup_test_db().await;
|
||||
KnowledgeEntityEmbedding::redefine_hnsw_index(&db, 3)
|
||||
.await
|
||||
.expect("set test index dimension");
|
||||
let user_id = "user_ke";
|
||||
let entity_key = "entity-fetch";
|
||||
let source_id = "source-fetch";
|
||||
|
||||
let entity = build_knowledge_entity_with_id(entity_key, source_id, user_id);
|
||||
KnowledgeEntity::store_with_embedding(entity.clone(), vec![0.7_f32, 0.8, 0.9], &db)
|
||||
.await
|
||||
.expect("Failed to store entity with embedding");
|
||||
|
||||
let entity_rid = RecordId::from_table_key(KnowledgeEntity::table_name(), &entity.id);
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Row {
|
||||
entity_id: KnowledgeEntity,
|
||||
}
|
||||
|
||||
let mut res = db
|
||||
.client
|
||||
.query(
|
||||
"SELECT entity_id FROM knowledge_entity_embedding WHERE entity_id = $id FETCH entity_id;",
|
||||
)
|
||||
.bind(("id", entity_rid.clone()))
|
||||
.await
|
||||
.expect("failed to fetch embedding with FETCH");
|
||||
let rows: Vec<Row> = res.take(0).expect("failed to deserialize fetch rows");
|
||||
|
||||
assert_eq!(rows.len(), 1);
|
||||
let fetched_entity = &rows[0].entity_id;
|
||||
assert_eq!(fetched_entity.id, entity_key);
|
||||
assert_eq!(fetched_entity.name, "Test entity");
|
||||
assert_eq!(fetched_entity.user_id, user_id);
|
||||
}
|
||||
}
|
||||
@@ -40,21 +40,28 @@ impl KnowledgeRelationship {
|
||||
}
|
||||
}
|
||||
pub async fn store_relationship(&self, db_client: &SurrealDbClient) -> Result<(), AppError> {
|
||||
let query = format!(
|
||||
r#"RELATE knowledge_entity:`{}`->relates_to:`{}`->knowledge_entity:`{}`
|
||||
SET
|
||||
metadata.user_id = '{}',
|
||||
metadata.source_id = '{}',
|
||||
metadata.relationship_type = '{}'"#,
|
||||
self.in_,
|
||||
self.id,
|
||||
self.out,
|
||||
self.metadata.user_id,
|
||||
self.metadata.source_id,
|
||||
self.metadata.relationship_type
|
||||
);
|
||||
|
||||
db_client.query(query).await?;
|
||||
db_client
|
||||
.client
|
||||
.query(
|
||||
r#"BEGIN TRANSACTION;
|
||||
LET $in_entity = type::thing('knowledge_entity', $in_id);
|
||||
LET $out_entity = type::thing('knowledge_entity', $out_id);
|
||||
LET $relation = type::thing('relates_to', $rel_id);
|
||||
DELETE type::thing('relates_to', $rel_id);
|
||||
RELATE $in_entity->$relation->$out_entity SET
|
||||
metadata.user_id = $user_id,
|
||||
metadata.source_id = $source_id,
|
||||
metadata.relationship_type = $relationship_type;
|
||||
COMMIT TRANSACTION;"#,
|
||||
)
|
||||
.bind(("rel_id", self.id.clone()))
|
||||
.bind(("in_id", self.in_.clone()))
|
||||
.bind(("out_id", self.out.clone()))
|
||||
.bind(("user_id", self.metadata.user_id.clone()))
|
||||
.bind(("source_id", self.metadata.source_id.clone()))
|
||||
.bind(("relationship_type", self.metadata.relationship_type.clone()))
|
||||
.await?
|
||||
.check()?;
|
||||
|
||||
Ok(())
|
||||
}
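// Illustrative sketch (not an existing method): the same bound-parameter pattern
// in isolation. Values reach SurrealDB via `.bind(...)`, so quotes in
// user-supplied data never become part of the query text.
async fn example_count_for_source(
    source_id: String,
    db_client: &SurrealDbClient,
) -> Result<usize, AppError> {
    let mut res = db_client
        .client
        .query("SELECT * FROM relates_to WHERE metadata.source_id = $source_id")
        .bind(("source_id", source_id))
        .await?;
    let rows: Vec<KnowledgeRelationship> = res.take(0)?;
    Ok(rows.len())
}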
|
||||
@@ -63,25 +70,55 @@ impl KnowledgeRelationship {
|
||||
source_id: &str,
|
||||
db_client: &SurrealDbClient,
|
||||
) -> Result<(), AppError> {
|
||||
let query = format!(
|
||||
"DELETE knowledge_entity -> relates_to WHERE metadata.source_id = '{}'",
|
||||
source_id
|
||||
);
|
||||
|
||||
db_client.query(query).await?;
|
||||
db_client
|
||||
.client
|
||||
.query("DELETE FROM relates_to WHERE metadata.source_id = $source_id")
|
||||
.bind(("source_id", source_id.to_owned()))
|
||||
.await?
|
||||
.check()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn delete_relationship_by_id(
|
||||
id: &str,
|
||||
user_id: &str,
|
||||
db_client: &SurrealDbClient,
|
||||
) -> Result<(), AppError> {
|
||||
let query = format!("DELETE relates_to:`{}`", id);
|
||||
let mut authorized_result = db_client
|
||||
.client
|
||||
.query(
|
||||
"SELECT * FROM relates_to WHERE id = type::thing('relates_to', $id) AND metadata.user_id = $user_id",
|
||||
)
|
||||
.bind(("id", id.to_owned()))
|
||||
.bind(("user_id", user_id.to_owned()))
|
||||
.await?;
|
||||
let authorized: Vec<KnowledgeRelationship> = authorized_result.take(0).unwrap_or_default();
|
||||
|
||||
db_client.query(query).await?;
|
||||
if authorized.is_empty() {
|
||||
let mut exists_result = db_client
|
||||
.client
|
||||
.query("SELECT * FROM type::thing('relates_to', $id)")
|
||||
.bind(("id", id.to_owned()))
|
||||
.await?;
|
||||
let existing: Option<KnowledgeRelationship> = exists_result.take(0)?;
|
||||
|
||||
Ok(())
|
||||
if existing.is_some() {
|
||||
Err(AppError::Auth(
|
||||
"Not authorized to delete relationship".into(),
|
||||
))
|
||||
} else {
|
||||
Err(AppError::NotFound(format!("Relationship {id} not found")))
|
||||
}
|
||||
} else {
|
||||
db_client
|
||||
.client
|
||||
.query("DELETE type::thing('relates_to', $id)")
|
||||
.bind(("id", id.to_owned()))
|
||||
.await?
|
||||
.check()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
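// Illustrative sketch: distinguishing "not yours" from "does not exist" at a
// call site, using the error variants returned above. The function name and
// arguments are placeholders, not part of the crate's API.
async fn example_try_delete_relationship(
    relationship_id: &str,
    current_user_id: &str,
    db: &SurrealDbClient,
) -> Result<bool, AppError> {
    match KnowledgeRelationship::delete_relationship_by_id(relationship_id, current_user_id, db)
        .await
    {
        Ok(()) => Ok(true),                      // deleted
        Err(AppError::Auth(_)) => Ok(false),     // exists, but owned by another user
        Err(AppError::NotFound(_)) => Ok(false), // nothing with that id
        Err(other) => Err(other),                // database or other failure
    }
}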
|
||||
|
||||
@@ -90,12 +127,39 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::storage::types::knowledge_entity::{KnowledgeEntity, KnowledgeEntityType};
|
||||
|
||||
async fn setup_test_db() -> SurrealDbClient {
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
db
|
||||
}
|
||||
|
||||
async fn get_relationship_by_id(
|
||||
relationship_id: &str,
|
||||
db_client: &SurrealDbClient,
|
||||
) -> Option<KnowledgeRelationship> {
|
||||
let mut result = db_client
|
||||
.client
|
||||
.query("SELECT * FROM type::thing('relates_to', $id)")
|
||||
.bind(("id", relationship_id.to_owned()))
|
||||
.await
|
||||
.expect("relationship query by id failed");
|
||||
|
||||
result.take(0).expect("failed to take relationship by id")
|
||||
}
|
||||
|
||||
// Helper function to create a test knowledge entity for the relationship tests
|
||||
async fn create_test_entity(name: &str, db_client: &SurrealDbClient) -> String {
|
||||
let source_id = "source123".to_string();
|
||||
let description = format!("Description for {}", name);
|
||||
let entity_type = KnowledgeEntityType::Document;
|
||||
let embedding = vec![0.1, 0.2, 0.3];
|
||||
let user_id = "user123".to_string();
|
||||
|
||||
let entity = KnowledgeEntity::new(
|
||||
@@ -104,7 +168,6 @@ mod tests {
|
||||
description,
|
||||
entity_type,
|
||||
None,
|
||||
embedding,
|
||||
user_id,
|
||||
);
|
||||
|
||||
@@ -141,13 +204,9 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_store_relationship() {
|
||||
async fn test_store_and_verify_by_source_id() {
|
||||
// Setup in-memory database for testing
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
let db = setup_test_db().await;
|
||||
|
||||
// Create two entities to relate
|
||||
let entity1_id = create_test_entity("Entity 1", &db).await;
|
||||
@@ -161,7 +220,7 @@ mod tests {
|
||||
let relationship = KnowledgeRelationship::new(
|
||||
entity1_id.clone(),
|
||||
entity2_id.clone(),
|
||||
user_id,
|
||||
user_id.clone(),
|
||||
source_id.clone(),
|
||||
relationship_type,
|
||||
);
|
||||
@@ -172,30 +231,69 @@ mod tests {
|
||||
.await
|
||||
.expect("Failed to store relationship");
|
||||
|
||||
let persisted = get_relationship_by_id(&relationship.id, &db)
|
||||
.await
|
||||
.expect("Relationship should be retrievable by id");
|
||||
assert_eq!(persisted.in_, entity1_id);
|
||||
assert_eq!(persisted.out, entity2_id);
|
||||
assert_eq!(persisted.metadata.user_id, user_id);
|
||||
assert_eq!(persisted.metadata.source_id, source_id);
|
||||
|
||||
// Query to verify the relationship exists by checking for relationships with our source_id
|
||||
// This approach is more reliable than trying to look up by ID
|
||||
let check_query = format!(
|
||||
"SELECT * FROM relates_to WHERE metadata.source_id = '{}'",
|
||||
source_id
|
||||
);
|
||||
let mut check_result = db.query(check_query).await.expect("Check query failed");
|
||||
let mut check_result = db
|
||||
.query("SELECT * FROM relates_to WHERE metadata.source_id = $source_id")
|
||||
.bind(("source_id", source_id.clone()))
|
||||
.await
|
||||
.expect("Check query failed");
|
||||
let check_results: Vec<KnowledgeRelationship> = check_result.take(0).unwrap_or_default();
|
||||
|
||||
// Just verify that a relationship was created
|
||||
assert!(
|
||||
!check_results.is_empty(),
|
||||
"Relationship should exist in the database"
|
||||
assert_eq!(
|
||||
check_results.len(),
|
||||
1,
|
||||
"Expected one relationship for source_id"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_delete_relationship_by_id() {
|
||||
// Setup in-memory database for testing
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
async fn test_store_relationship_resists_query_injection() {
|
||||
let db = setup_test_db().await;
|
||||
|
||||
let entity1_id = create_test_entity("Entity 1", &db).await;
|
||||
let entity2_id = create_test_entity("Entity 2", &db).await;
|
||||
|
||||
let relationship = KnowledgeRelationship::new(
|
||||
entity1_id,
|
||||
entity2_id,
|
||||
"user'123".to_string(),
|
||||
"source123'; DELETE FROM relates_to; --".to_string(),
|
||||
"references'; UPDATE user SET admin = true; --".to_string(),
|
||||
);
|
||||
|
||||
relationship
|
||||
.store_relationship(&db)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
.expect("store relationship should safely handle quote-containing values");
|
||||
|
||||
let mut res = db
|
||||
.client
|
||||
.query("SELECT * FROM relates_to WHERE id = type::thing('relates_to', $id)")
|
||||
.bind(("id", relationship.id.clone()))
|
||||
.await
|
||||
.expect("query relationship by id failed");
|
||||
let rows: Vec<KnowledgeRelationship> = res.take(0).expect("take rows");
|
||||
|
||||
assert_eq!(rows.len(), 1);
|
||||
assert_eq!(
|
||||
rows[0].metadata.source_id,
|
||||
"source123'; DELETE FROM relates_to; --"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_store_and_delete_relationship() {
|
||||
// Setup in-memory database for testing
|
||||
let db = setup_test_db().await;
|
||||
|
||||
// Create two entities to relate
|
||||
let entity1_id = create_test_entity("Entity 1", &db).await;
|
||||
@@ -209,39 +307,118 @@ mod tests {
|
||||
let relationship = KnowledgeRelationship::new(
|
||||
entity1_id.clone(),
|
||||
entity2_id.clone(),
|
||||
user_id,
|
||||
user_id.clone(),
|
||||
source_id.clone(),
|
||||
relationship_type,
|
||||
);
|
||||
|
||||
// Store the relationship
|
||||
// Store relationship
|
||||
relationship
|
||||
.store_relationship(&db)
|
||||
.await
|
||||
.expect("Failed to store relationship");
|
||||
|
||||
// Delete the relationship by ID
|
||||
KnowledgeRelationship::delete_relationship_by_id(&relationship.id, &db)
|
||||
// Ensure relationship exists before deletion attempt
|
||||
let mut existing_before_delete = db
|
||||
.query(format!(
|
||||
"SELECT * FROM relates_to WHERE metadata.user_id = '{}' AND metadata.source_id = '{}'",
|
||||
user_id, source_id
|
||||
))
|
||||
.await
|
||||
.expect("Query failed");
|
||||
let before_results: Vec<KnowledgeRelationship> =
|
||||
existing_before_delete.take(0).unwrap_or_default();
|
||||
assert!(
|
||||
!before_results.is_empty(),
|
||||
"Relationship should exist before deletion"
|
||||
);
|
||||
|
||||
// Delete relationship by ID
|
||||
KnowledgeRelationship::delete_relationship_by_id(&relationship.id, &user_id, &db)
|
||||
.await
|
||||
.expect("Failed to delete relationship by ID");
|
||||
|
||||
// Query to verify the relationship was deleted
|
||||
let query = format!("SELECT * FROM relates_to WHERE id = '{}'", relationship.id);
|
||||
let mut result = db.query(query).await.expect("Query failed");
|
||||
// Query to verify relationship was deleted
|
||||
let mut result = db
|
||||
.query(format!(
|
||||
"SELECT * FROM relates_to WHERE metadata.user_id = '{}' AND metadata.source_id = '{}'",
|
||||
user_id, source_id
|
||||
))
|
||||
.await
|
||||
.expect("Query failed");
|
||||
let results: Vec<KnowledgeRelationship> = result.take(0).unwrap_or_default();
|
||||
|
||||
// Verify the relationship no longer exists
|
||||
// Verify relationship no longer exists
|
||||
assert!(results.is_empty(), "Relationship should be deleted");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_delete_relationships_by_source_id() {
|
||||
// Setup in-memory database for testing
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
async fn test_delete_relationship_by_id_unauthorized() {
|
||||
let db = setup_test_db().await;
|
||||
|
||||
let entity1_id = create_test_entity("Entity 1", &db).await;
|
||||
let entity2_id = create_test_entity("Entity 2", &db).await;
|
||||
|
||||
let owner_user_id = "owner-user".to_string();
|
||||
let source_id = "source123".to_string();
|
||||
|
||||
let relationship = KnowledgeRelationship::new(
|
||||
entity1_id.clone(),
|
||||
entity2_id.clone(),
|
||||
owner_user_id.clone(),
|
||||
source_id,
|
||||
"references".to_string(),
|
||||
);
|
||||
|
||||
relationship
|
||||
.store_relationship(&db)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
.expect("Failed to store relationship");
|
||||
|
||||
let mut before_attempt = db
|
||||
.query(format!(
|
||||
"SELECT * FROM relates_to WHERE metadata.user_id = '{}'",
|
||||
owner_user_id
|
||||
))
|
||||
.await
|
||||
.expect("Query failed");
|
||||
let before_results: Vec<KnowledgeRelationship> = before_attempt.take(0).unwrap_or_default();
|
||||
assert!(
|
||||
!before_results.is_empty(),
|
||||
"Relationship should exist before unauthorized delete attempt"
|
||||
);
|
||||
|
||||
let result = KnowledgeRelationship::delete_relationship_by_id(
|
||||
&relationship.id,
|
||||
"different-user",
|
||||
&db,
|
||||
)
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Err(AppError::Auth(_)) => {}
|
||||
_ => panic!("Expected authorization error when deleting someone else's relationship"),
|
||||
}
|
||||
|
||||
let mut after_attempt = db
|
||||
.query(format!(
|
||||
"SELECT * FROM relates_to WHERE metadata.user_id = '{}'",
|
||||
owner_user_id
|
||||
))
|
||||
.await
|
||||
.expect("Query failed");
|
||||
let results: Vec<KnowledgeRelationship> = after_attempt.take(0).unwrap_or_default();
|
||||
|
||||
assert!(
|
||||
!results.is_empty(),
|
||||
"Relationship should still exist after unauthorized delete attempt"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_store_relationship_exists() {
|
||||
// Setup in-memory database for testing
|
||||
let db = setup_test_db().await;
|
||||
|
||||
// Create entities to relate
|
||||
let entity1_id = create_test_entity("Entity 1", &db).await;
|
||||
@@ -293,49 +470,87 @@ mod tests {
|
||||
.await
|
||||
.expect("Failed to store different relationship");
|
||||
|
||||
// Sanity-check setup: exactly two relationships use source_id and one uses different_source_id.
|
||||
let mut before_delete = db
|
||||
.query("SELECT * FROM relates_to WHERE metadata.source_id = $source_id")
|
||||
.bind(("source_id", source_id.clone()))
|
||||
.await
|
||||
.expect("before delete query failed");
|
||||
let before_delete_rows: Vec<KnowledgeRelationship> =
|
||||
before_delete.take(0).unwrap_or_default();
|
||||
assert_eq!(before_delete_rows.len(), 2);
|
||||
|
||||
let mut before_delete_different = db
|
||||
.query("SELECT * FROM relates_to WHERE metadata.source_id = $source_id")
|
||||
.bind(("source_id", different_source_id.clone()))
|
||||
.await
|
||||
.expect("before delete different query failed");
|
||||
let before_delete_different_rows: Vec<KnowledgeRelationship> =
|
||||
before_delete_different.take(0).unwrap_or_default();
|
||||
assert_eq!(before_delete_different_rows.len(), 1);
|
||||
|
||||
// Delete relationships by source_id
|
||||
KnowledgeRelationship::delete_relationships_by_source_id(&source_id, &db)
|
||||
.await
|
||||
.expect("Failed to delete relationships by source_id");
|
||||
|
||||
// Query to verify the relationships with source_id were deleted
|
||||
let query1 = format!("SELECT * FROM relates_to WHERE id = '{}'", relationship1.id);
|
||||
let query2 = format!("SELECT * FROM relates_to WHERE id = '{}'", relationship2.id);
|
||||
let different_query = format!(
|
||||
"SELECT * FROM relates_to WHERE id = '{}'",
|
||||
different_relationship.id
|
||||
);
|
||||
|
||||
let mut result1 = db.query(query1).await.expect("Query 1 failed");
|
||||
let results1: Vec<KnowledgeRelationship> = result1.take(0).unwrap_or_default();
|
||||
|
||||
let mut result2 = db.query(query2).await.expect("Query 2 failed");
|
||||
let results2: Vec<KnowledgeRelationship> = result2.take(0).unwrap_or_default();
|
||||
|
||||
let mut different_result = db
|
||||
.query(different_query)
|
||||
.await
|
||||
.expect("Different query failed");
|
||||
let _different_results: Vec<KnowledgeRelationship> =
|
||||
different_result.take(0).unwrap_or_default();
|
||||
// Query to verify the specific relationships with source_id were deleted.
|
||||
let result1 = get_relationship_by_id(&relationship1.id, &db).await;
|
||||
let result2 = get_relationship_by_id(&relationship2.id, &db).await;
|
||||
let different_result = get_relationship_by_id(&different_relationship.id, &db).await;
|
||||
|
||||
// Verify relationships with the source_id are deleted
|
||||
assert!(results1.is_empty(), "Relationship 1 should be deleted");
|
||||
assert!(results2.is_empty(), "Relationship 2 should be deleted");
|
||||
assert!(result1.is_none(), "Relationship 1 should be deleted");
|
||||
assert!(result2.is_none(), "Relationship 2 should be deleted");
|
||||
let remaining =
|
||||
different_result.expect("Relationship with different source_id should remain");
|
||||
assert_eq!(remaining.metadata.source_id, different_source_id);
|
||||
}
|
||||
|
||||
// For the relationship with different source ID, we need to check differently
|
||||
// Let's just verify we have a relationship where the source_id matches different_source_id
|
||||
let check_query = format!(
|
||||
"SELECT * FROM relates_to WHERE metadata.source_id = '{}'",
|
||||
different_source_id
|
||||
#[tokio::test]
|
||||
async fn test_delete_relationships_by_source_id_resists_query_injection() {
|
||||
let db = setup_test_db().await;
|
||||
|
||||
let entity1_id = create_test_entity("Entity 1", &db).await;
|
||||
let entity2_id = create_test_entity("Entity 2", &db).await;
|
||||
let entity3_id = create_test_entity("Entity 3", &db).await;
|
||||
|
||||
let safe_relationship = KnowledgeRelationship::new(
|
||||
entity1_id.clone(),
|
||||
entity2_id.clone(),
|
||||
"user123".to_string(),
|
||||
"safe_source".to_string(),
|
||||
"references".to_string(),
|
||||
);
|
||||
let mut check_result = db.query(check_query).await.expect("Check query failed");
|
||||
let check_results: Vec<KnowledgeRelationship> = check_result.take(0).unwrap_or_default();
|
||||
|
||||
// Verify the relationship with a different source_id still exists
|
||||
let other_relationship = KnowledgeRelationship::new(
|
||||
entity2_id,
|
||||
entity3_id,
|
||||
"user123".to_string(),
|
||||
"other_source".to_string(),
|
||||
"contains".to_string(),
|
||||
);
|
||||
|
||||
safe_relationship
|
||||
.store_relationship(&db)
|
||||
.await
|
||||
.expect("store safe relationship");
|
||||
other_relationship
|
||||
.store_relationship(&db)
|
||||
.await
|
||||
.expect("store other relationship");
|
||||
|
||||
KnowledgeRelationship::delete_relationships_by_source_id("safe_source' OR 1=1 --", &db)
|
||||
.await
|
||||
.expect("delete call should succeed");
|
||||
|
||||
let remaining_safe = get_relationship_by_id(&safe_relationship.id, &db).await;
|
||||
let remaining_other = get_relationship_by_id(&other_relationship.id, &db).await;
|
||||
|
||||
assert!(remaining_safe.is_some(), "Safe relationship should remain");
|
||||
assert!(
|
||||
!check_results.is_empty(),
|
||||
"Relationship with different source_id should still exist"
|
||||
remaining_other.is_some(),
|
||||
"Other relationship should remain"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#![allow(clippy::module_name_repetitions)]
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::stored_object;
|
||||
@@ -56,7 +57,7 @@ impl fmt::Display for Message {
|
||||
pub fn format_history(history: &[Message]) -> String {
|
||||
history
|
||||
.iter()
|
||||
.map(|msg| format!("{}", msg))
|
||||
.map(|msg| format!("{msg}"))
|
||||
.collect::<Vec<String>>()
|
||||
.join("\n")
|
||||
}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#![allow(clippy::unsafe_derive_deserialize)]
|
||||
use serde::{Deserialize, Serialize};
|
||||
pub mod analytics;
|
||||
pub mod conversation;
|
||||
@@ -5,11 +6,14 @@ pub mod file_info;
|
||||
pub mod ingestion_payload;
|
||||
pub mod ingestion_task;
|
||||
pub mod knowledge_entity;
|
||||
pub mod knowledge_entity_embedding;
|
||||
pub mod knowledge_relationship;
|
||||
pub mod message;
|
||||
pub mod scratchpad;
|
||||
pub mod system_prompts;
|
||||
pub mod system_settings;
|
||||
pub mod text_chunk;
|
||||
pub mod text_chunk_embedding;
|
||||
pub mod text_content;
|
||||
pub mod user;
|
||||
|
||||
@@ -20,7 +24,7 @@ pub trait StoredObject: Serialize + for<'de> Deserialize<'de> {
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! stored_object {
|
||||
($name:ident, $table:expr, {$($(#[$attr:meta])* $field:ident: $ty:ty),*}) => {
|
||||
($(#[$struct_attr:meta])* $name:ident, $table:expr, {$($(#[$field_attr:meta])* $field:ident: $ty:ty),*}) => {
|
||||
use serde::{Deserialize, Deserializer, Serialize};
|
||||
use surrealdb::sql::Thing;
|
||||
use $crate::storage::types::StoredObject;
|
||||
@@ -83,7 +87,36 @@ macro_rules! stored_object {
|
||||
Ok(DateTime::<Utc>::from(dt))
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[allow(clippy::ref_option)]
|
||||
fn serialize_option_datetime<S>(
|
||||
date: &Option<DateTime<Utc>>,
|
||||
serializer: S,
|
||||
) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
match date {
|
||||
Some(dt) => serializer
|
||||
.serialize_some(&Into::<surrealdb::sql::Datetime>::into(*dt)),
|
||||
None => serializer.serialize_none(),
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[allow(clippy::ref_option)]
|
||||
fn deserialize_option_datetime<'de, D>(
|
||||
deserializer: D,
|
||||
) -> Result<Option<DateTime<Utc>>, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let value = Option::<surrealdb::sql::Datetime>::deserialize(deserializer)?;
|
||||
Ok(value.map(DateTime::<Utc>::from))
|
||||
}
|
||||
|
||||
|
||||
$(#[$struct_attr])*
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct $name {
|
||||
#[serde(deserialize_with = "deserialize_flexible_id")]
|
||||
@@ -92,7 +125,7 @@ macro_rules! stored_object {
|
||||
pub created_at: DateTime<Utc>,
|
||||
#[serde(serialize_with = "serialize_datetime", deserialize_with = "deserialize_datetime", default)]
|
||||
pub updated_at: DateTime<Utc>,
|
||||
$(pub $field: $ty),*
|
||||
$( $(#[$field_attr])* pub $field: $ty),*
|
||||
}
|
||||
|
||||
impl StoredObject for $name {
|
||||
|
||||
502  common/src/storage/types/scratchpad.rs  Normal file
@@ -0,0 +1,502 @@
|
||||
use chrono::Utc as ChronoUtc;
|
||||
use surrealdb::opt::PatchOp;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::{error::AppError, storage::db::SurrealDbClient, stored_object};
|
||||
|
||||
stored_object!(Scratchpad, "scratchpad", {
|
||||
user_id: String,
|
||||
title: String,
|
||||
content: String,
|
||||
#[serde(serialize_with = "serialize_datetime", deserialize_with="deserialize_datetime")]
|
||||
last_saved_at: DateTime<Utc>,
|
||||
is_dirty: bool,
|
||||
#[serde(default)]
|
||||
is_archived: bool,
|
||||
#[serde(
|
||||
serialize_with = "serialize_option_datetime",
|
||||
deserialize_with = "deserialize_option_datetime",
|
||||
default
|
||||
)]
|
||||
archived_at: Option<DateTime<Utc>>,
|
||||
#[serde(
|
||||
serialize_with = "serialize_option_datetime",
|
||||
deserialize_with = "deserialize_option_datetime",
|
||||
default
|
||||
)]
|
||||
ingested_at: Option<DateTime<Utc>>
|
||||
});
|
||||
|
||||
impl Scratchpad {
|
||||
pub fn new(user_id: String, title: String) -> Self {
|
||||
let now = ChronoUtc::now();
|
||||
Self {
|
||||
id: Uuid::new_v4().to_string(),
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
user_id,
|
||||
title,
|
||||
content: String::new(),
|
||||
last_saved_at: now,
|
||||
is_dirty: false,
|
||||
is_archived: false,
|
||||
archived_at: None,
|
||||
ingested_at: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn get_by_user(user_id: &str, db: &SurrealDbClient) -> Result<Vec<Self>, AppError> {
|
||||
let scratchpads: Vec<Scratchpad> = db.client
|
||||
.query("SELECT * FROM type::table($table_name) WHERE user_id = $user_id AND (is_archived = false OR is_archived IS NONE) ORDER BY updated_at DESC")
|
||||
.bind(("table_name", Self::table_name()))
|
||||
.bind(("user_id", user_id.to_string()))
|
||||
.await?
|
||||
.take(0)?;
|
||||
|
||||
Ok(scratchpads)
|
||||
}
|
||||
|
||||
pub async fn get_archived_by_user(
|
||||
user_id: &str,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<Vec<Self>, AppError> {
|
||||
let scratchpads: Vec<Scratchpad> = db.client
|
||||
.query("SELECT * FROM type::table($table_name) WHERE user_id = $user_id AND is_archived = true ORDER BY archived_at DESC, updated_at DESC")
|
||||
.bind(("table_name", Self::table_name()))
|
||||
.bind(("user_id", user_id.to_string()))
|
||||
.await?
|
||||
.take(0)?;
|
||||
|
||||
Ok(scratchpads)
|
||||
}
|
||||
|
||||
pub async fn get_by_id(
|
||||
id: &str,
|
||||
user_id: &str,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<Self, AppError> {
|
||||
let scratchpad: Option<Scratchpad> = db.get_item(id).await?;
|
||||
|
||||
let scratchpad =
|
||||
scratchpad.ok_or_else(|| AppError::NotFound("Scratchpad not found".to_string()))?;
|
||||
|
||||
if scratchpad.user_id != user_id {
|
||||
return Err(AppError::Auth(
|
||||
"You don't have access to this scratchpad".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(scratchpad)
|
||||
}
|
||||
|
||||
pub async fn update_content(
|
||||
id: &str,
|
||||
user_id: &str,
|
||||
new_content: &str,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<Self, AppError> {
|
||||
// First verify ownership
|
||||
let scratchpad = Self::get_by_id(id, user_id, db).await?;
|
||||
|
||||
if scratchpad.is_archived {
|
||||
return Ok(scratchpad);
|
||||
}
|
||||
|
||||
let now = ChronoUtc::now();
|
||||
let _updated: Option<Self> = db
|
||||
.update((Self::table_name(), id))
|
||||
.patch(PatchOp::replace("/content", new_content.to_string()))
|
||||
.patch(PatchOp::replace(
|
||||
"/updated_at",
|
||||
surrealdb::Datetime::from(now),
|
||||
))
|
||||
.patch(PatchOp::replace(
|
||||
"/last_saved_at",
|
||||
surrealdb::Datetime::from(now),
|
||||
))
|
||||
.patch(PatchOp::replace("/is_dirty", false))
|
||||
.await?;
|
||||
|
||||
// Return the updated scratchpad
|
||||
Self::get_by_id(id, user_id, db).await
|
||||
}
|
||||
|
||||
pub async fn update_title(
|
||||
id: &str,
|
||||
user_id: &str,
|
||||
new_title: &str,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<(), AppError> {
|
||||
// First verify ownership
|
||||
let _scratchpad = Self::get_by_id(id, user_id, db).await?;
|
||||
|
||||
let _updated: Option<Self> = db
|
||||
.update((Self::table_name(), id))
|
||||
.patch(PatchOp::replace("/title", new_title.to_string()))
|
||||
.patch(PatchOp::replace(
|
||||
"/updated_at",
|
||||
surrealdb::Datetime::from(ChronoUtc::now()),
|
||||
))
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn delete(id: &str, user_id: &str, db: &SurrealDbClient) -> Result<(), AppError> {
|
||||
// First verify ownership
|
||||
let _scratchpad = Self::get_by_id(id, user_id, db).await?;
|
||||
|
||||
let _: Option<Self> = db.client.delete((Self::table_name(), id)).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn archive(
|
||||
id: &str,
|
||||
user_id: &str,
|
||||
db: &SurrealDbClient,
|
||||
mark_ingested: bool,
|
||||
) -> Result<Self, AppError> {
|
||||
// Verify ownership
|
||||
let scratchpad = Self::get_by_id(id, user_id, db).await?;
|
||||
|
||||
if scratchpad.is_archived {
|
||||
if mark_ingested && scratchpad.ingested_at.is_none() {
|
||||
// Ensure ingested_at is set if required
|
||||
let surreal_now = surrealdb::Datetime::from(ChronoUtc::now());
|
||||
let _updated: Option<Self> = db
|
||||
.update((Self::table_name(), id))
|
||||
.patch(PatchOp::replace("/ingested_at", surreal_now))
|
||||
.await?;
|
||||
return Self::get_by_id(id, user_id, db).await;
|
||||
}
|
||||
return Ok(scratchpad);
|
||||
}
|
||||
|
||||
let now = ChronoUtc::now();
|
||||
let surreal_now = surrealdb::Datetime::from(now);
|
||||
let mut update = db
|
||||
.update((Self::table_name(), id))
|
||||
.patch(PatchOp::replace("/is_archived", true))
|
||||
.patch(PatchOp::replace("/archived_at", surreal_now.clone()))
|
||||
.patch(PatchOp::replace("/updated_at", surreal_now.clone()));
|
||||
|
||||
update = if mark_ingested {
|
||||
update.patch(PatchOp::replace("/ingested_at", surreal_now))
|
||||
} else {
|
||||
update.patch(PatchOp::remove("/ingested_at"))
|
||||
};
|
||||
|
||||
let _updated: Option<Self> = update.await?;
|
||||
|
||||
Self::get_by_id(id, user_id, db).await
|
||||
}
|
||||
|
||||
pub async fn restore(id: &str, user_id: &str, db: &SurrealDbClient) -> Result<Self, AppError> {
|
||||
// Verify ownership
|
||||
let scratchpad = Self::get_by_id(id, user_id, db).await?;
|
||||
|
||||
if !scratchpad.is_archived {
|
||||
return Ok(scratchpad);
|
||||
}
|
||||
|
||||
let now = ChronoUtc::now();
|
||||
let surreal_now = surrealdb::Datetime::from(now);
|
||||
let _updated: Option<Self> = db
|
||||
.update((Self::table_name(), id))
|
||||
.patch(PatchOp::replace("/is_archived", false))
|
||||
.patch(PatchOp::remove("/archived_at"))
|
||||
.patch(PatchOp::remove("/ingested_at"))
|
||||
.patch(PatchOp::replace("/updated_at", surreal_now))
|
||||
.await?;
|
||||
|
||||
Self::get_by_id(id, user_id, db).await
|
||||
}
|
||||
}
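// Illustrative sketch: the archive/restore lifecycle from a caller's point of
// view. Ids, titles and the error type returned by `store_item` are assumptions
// based on how the rest of this file uses the client.
async fn example_scratchpad_lifecycle(db: &SurrealDbClient) -> Result<(), AppError> {
    let pad = Scratchpad::new("user-1".to_string(), "Notes".to_string());
    let pad_id = pad.id.clone();
    let _ = db.store_item(pad).await?;

    // Editing refreshes content, updated_at and last_saved_at, and clears is_dirty.
    let edited = Scratchpad::update_content(&pad_id, "user-1", "first draft", db).await?;
    assert!(!edited.is_dirty);

    // Archiving with `mark_ingested = true` also stamps ingested_at.
    let archived = Scratchpad::archive(&pad_id, "user-1", db, true).await?;
    assert!(archived.is_archived && archived.ingested_at.is_some());

    // Restoring clears is_archived, archived_at and ingested_at again.
    let restored = Scratchpad::restore(&pad_id, "user-1", db).await?;
    assert!(!restored.is_archived && restored.archived_at.is_none());
    Ok(())
}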
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_create_scratchpad() {
|
||||
// Setup in-memory database for testing
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
// Create a new scratchpad
|
||||
let user_id = "test_user";
|
||||
let title = "Test Scratchpad";
|
||||
let scratchpad = Scratchpad::new(user_id.to_string(), title.to_string());
|
||||
|
||||
// Verify scratchpad properties
|
||||
assert_eq!(scratchpad.user_id, user_id);
|
||||
assert_eq!(scratchpad.title, title);
|
||||
assert_eq!(scratchpad.content, "");
|
||||
assert!(!scratchpad.is_dirty);
|
||||
assert!(!scratchpad.is_archived);
|
||||
assert!(scratchpad.archived_at.is_none());
|
||||
assert!(scratchpad.ingested_at.is_none());
|
||||
assert!(!scratchpad.id.is_empty());
|
||||
|
||||
// Store the scratchpad
|
||||
let result = db.store_item(scratchpad.clone()).await;
|
||||
assert!(result.is_ok());
|
||||
|
||||
// Verify it can be retrieved
|
||||
let retrieved: Option<Scratchpad> = db
|
||||
.get_item(&scratchpad.id)
|
||||
.await
|
||||
.expect("Failed to retrieve scratchpad");
|
||||
assert!(retrieved.is_some());
|
||||
|
||||
let retrieved = retrieved.unwrap();
|
||||
assert_eq!(retrieved.id, scratchpad.id);
|
||||
assert_eq!(retrieved.user_id, user_id);
|
||||
assert_eq!(retrieved.title, title);
|
||||
assert!(!retrieved.is_archived);
|
||||
assert!(retrieved.archived_at.is_none());
|
||||
assert!(retrieved.ingested_at.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get_by_user() {
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
let user_id = "test_user";
|
||||
|
||||
// Create multiple scratchpads
|
||||
let scratchpad1 = Scratchpad::new(user_id.to_string(), "First".to_string());
|
||||
let scratchpad2 = Scratchpad::new(user_id.to_string(), "Second".to_string());
|
||||
let scratchpad3 = Scratchpad::new("other_user".to_string(), "Other".to_string());
|
||||
|
||||
// Store them
|
||||
let scratchpad1_id = scratchpad1.id.clone();
|
||||
let scratchpad2_id = scratchpad2.id.clone();
|
||||
db.store_item(scratchpad1).await.unwrap();
|
||||
db.store_item(scratchpad2).await.unwrap();
|
||||
db.store_item(scratchpad3).await.unwrap();
|
||||
|
||||
// Archive one of the user's scratchpads
|
||||
Scratchpad::archive(&scratchpad2_id, user_id, &db, false)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Get scratchpads for user_id
|
||||
let user_scratchpads = Scratchpad::get_by_user(user_id, &db).await.unwrap();
|
||||
assert_eq!(user_scratchpads.len(), 1);
|
||||
assert_eq!(user_scratchpads[0].id, scratchpad1_id);
|
||||
|
||||
// Verify they belong to the user
|
||||
for scratchpad in &user_scratchpads {
|
||||
assert_eq!(scratchpad.user_id, user_id);
|
||||
}
|
||||
|
||||
let archived = Scratchpad::get_archived_by_user(user_id, &db)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(archived.len(), 1);
|
||||
assert_eq!(archived[0].id, scratchpad2_id);
|
||||
assert!(archived[0].is_archived);
|
||||
assert!(archived[0].ingested_at.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_archive_and_restore() {
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
let user_id = "test_user";
|
||||
let scratchpad = Scratchpad::new(user_id.to_string(), "Test".to_string());
|
||||
let scratchpad_id = scratchpad.id.clone();
|
||||
db.store_item(scratchpad).await.unwrap();
|
||||
|
||||
let archived = Scratchpad::archive(&scratchpad_id, user_id, &db, true)
|
||||
.await
|
||||
.expect("Failed to archive");
|
||||
assert!(archived.is_archived);
|
||||
assert!(archived.archived_at.is_some());
|
||||
assert!(archived.ingested_at.is_some());
|
||||
|
||||
let restored = Scratchpad::restore(&scratchpad_id, user_id, &db)
|
||||
.await
|
||||
.expect("Failed to restore");
|
||||
assert!(!restored.is_archived);
|
||||
assert!(restored.archived_at.is_none());
|
||||
assert!(restored.ingested_at.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_update_content() {
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
let user_id = "test_user";
|
||||
let scratchpad = Scratchpad::new(user_id.to_string(), "Test".to_string());
|
||||
let scratchpad_id = scratchpad.id.clone();
|
||||
|
||||
db.store_item(scratchpad).await.unwrap();
|
||||
|
||||
let new_content = "Updated content";
|
||||
let updated = Scratchpad::update_content(&scratchpad_id, user_id, new_content, &db)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(updated.content, new_content);
|
||||
assert!(!updated.is_dirty);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_update_content_unauthorized() {
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
let owner_id = "owner";
|
||||
let other_user = "other_user";
|
||||
let scratchpad = Scratchpad::new(owner_id.to_string(), "Test".to_string());
|
||||
let scratchpad_id = scratchpad.id.clone();
|
||||
|
||||
db.store_item(scratchpad).await.unwrap();
|
||||
|
||||
let result = Scratchpad::update_content(&scratchpad_id, other_user, "Hacked", &db).await;
|
||||
assert!(result.is_err());
|
||||
match result {
|
||||
Err(AppError::Auth(_)) => {}
|
||||
_ => panic!("Expected Auth error"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_delete_scratchpad() {
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
let user_id = "test_user";
|
||||
let scratchpad = Scratchpad::new(user_id.to_string(), "Test".to_string());
|
||||
let scratchpad_id = scratchpad.id.clone();
|
||||
|
||||
db.store_item(scratchpad).await.unwrap();
|
||||
|
||||
// Delete should succeed
|
||||
let result = Scratchpad::delete(&scratchpad_id, user_id, &db).await;
|
||||
assert!(result.is_ok());
|
||||
|
||||
// Verify it's gone
|
||||
let retrieved: Option<Scratchpad> = db.get_item(&scratchpad_id).await.unwrap();
|
||||
assert!(retrieved.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_delete_unauthorized() {
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
let owner_id = "owner";
|
||||
let other_user = "other_user";
|
||||
let scratchpad = Scratchpad::new(owner_id.to_string(), "Test".to_string());
|
||||
let scratchpad_id = scratchpad.id.clone();
|
||||
|
||||
db.store_item(scratchpad).await.unwrap();
|
||||
|
||||
let result = Scratchpad::delete(&scratchpad_id, other_user, &db).await;
|
||||
assert!(result.is_err());
|
||||
match result {
|
||||
Err(AppError::Auth(_)) => {}
|
||||
_ => panic!("Expected Auth error"),
|
||||
}
|
||||
|
||||
// Verify it still exists
|
||||
let retrieved: Option<Scratchpad> = db.get_item(&scratchpad_id).await.unwrap();
|
||||
assert!(retrieved.is_some());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_timezone_aware_scratchpad_conversion() {
|
||||
let db = SurrealDbClient::memory("test_ns", &Uuid::new_v4().to_string())
|
||||
.await
|
||||
.expect("Failed to create test database");
|
||||
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
let user_id = "test_user_123";
|
||||
let scratchpad =
|
||||
Scratchpad::new(user_id.to_string(), "Test Timezone Scratchpad".to_string());
|
||||
let scratchpad_id = scratchpad.id.clone();
|
||||
|
||||
db.store_item(scratchpad).await.unwrap();
|
||||
|
||||
let retrieved = Scratchpad::get_by_id(&scratchpad_id, user_id, &db)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Test that datetime fields are preserved and can be used for timezone formatting
|
||||
assert!(retrieved.created_at.timestamp() > 0);
|
||||
assert!(retrieved.updated_at.timestamp() > 0);
|
||||
assert!(retrieved.last_saved_at.timestamp() > 0);
|
||||
|
||||
// Test that optional datetime fields work correctly
|
||||
assert!(retrieved.archived_at.is_none());
|
||||
assert!(retrieved.ingested_at.is_none());
|
||||
|
||||
// Archive the scratchpad to test optional datetime handling
|
||||
let archived = Scratchpad::archive(&scratchpad_id, user_id, &db, false)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert!(archived.archived_at.is_some());
|
||||
assert!(archived.archived_at.unwrap().timestamp() > 0);
|
||||
assert!(archived.ingested_at.is_none());
|
||||
}
|
||||
}
|
||||
@@ -13,6 +13,9 @@ pub struct SystemSettings {
|
||||
pub processing_model: String,
|
||||
pub embedding_model: String,
|
||||
pub embedding_dimensions: u32,
|
||||
/// Active embedding backend ("openai", "fastembed", "hashed"). Read-only, synced from config.
|
||||
#[serde(default)]
|
||||
pub embedding_backend: Option<String>,
|
||||
pub query_system_prompt: String,
|
||||
pub ingestion_system_prompt: String,
|
||||
pub image_processing_model: String,
|
||||
@@ -49,15 +52,113 @@ impl SystemSettings {
|
||||
"Something went wrong updating the settings".into(),
|
||||
))
|
||||
}
|
||||
|
||||
/// Syncs SystemSettings with the active embedding provider's properties.
|
||||
/// Updates embedding_backend, embedding_model, and embedding_dimensions if they differ.
|
||||
/// Returns true if any settings were changed.
|
||||
pub async fn sync_from_embedding_provider(
|
||||
db: &SurrealDbClient,
|
||||
provider: &crate::utils::embedding::EmbeddingProvider,
|
||||
) -> Result<(Self, bool), AppError> {
|
||||
let mut settings = Self::get_current(db).await?;
|
||||
let mut needs_update = false;
|
||||
|
||||
let backend_label = provider.backend_label().to_string();
|
||||
let provider_dimensions = provider.dimension() as u32;
|
||||
let provider_model = provider.model_code();
|
||||
|
||||
// Sync backend label
|
||||
if settings.embedding_backend.as_deref() != Some(&backend_label) {
|
||||
settings.embedding_backend = Some(backend_label);
|
||||
needs_update = true;
|
||||
}
|
||||
|
||||
// Sync dimensions
|
||||
if settings.embedding_dimensions != provider_dimensions {
|
||||
tracing::info!(
|
||||
old_dimensions = settings.embedding_dimensions,
|
||||
new_dimensions = provider_dimensions,
|
||||
"Embedding dimensions changed, updating SystemSettings"
|
||||
);
|
||||
settings.embedding_dimensions = provider_dimensions;
|
||||
needs_update = true;
|
||||
}
|
||||
|
||||
// Sync model if provider has one
|
||||
if let Some(model) = provider_model {
|
||||
if settings.embedding_model != model {
|
||||
tracing::info!(
|
||||
old_model = %settings.embedding_model,
|
||||
new_model = %model,
|
||||
"Embedding model changed, updating SystemSettings"
|
||||
);
|
||||
settings.embedding_model = model;
|
||||
needs_update = true;
|
||||
}
|
||||
}
|
||||
|
||||
if needs_update {
|
||||
settings = Self::update(db, settings).await?;
|
||||
}
|
||||
|
||||
Ok((settings, needs_update))
|
||||
}
|
||||
}
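// Illustrative sketch: running the sync once at startup, after the embedding
// provider has been constructed elsewhere. Only APIs shown in this module are
// used; the surrounding wiring is assumed.
async fn example_sync_settings_at_startup(
    db: &SurrealDbClient,
    provider: &crate::utils::embedding::EmbeddingProvider,
) -> Result<SystemSettings, AppError> {
    let (settings, changed) = SystemSettings::sync_from_embedding_provider(db, provider).await?;
    if changed {
        // Dimensions or model differ from what was stored, so existing vectors
        // may need re-embedding and the HNSW indexes re-defining.
        tracing::info!(
            backend = ?settings.embedding_backend,
            dimensions = settings.embedding_dimensions,
            "embedding settings were updated from the active provider"
        );
    }
    Ok(settings)
}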
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::storage::types::text_chunk::TextChunk;
|
||||
use crate::storage::indexes::ensure_runtime_indexes;
|
||||
use crate::storage::types::{knowledge_entity::KnowledgeEntity, text_chunk::TextChunk};
|
||||
use async_openai::Client;
|
||||
|
||||
use super::*;
|
||||
use uuid::Uuid;
|
||||
|
||||
async fn get_hnsw_index_dimension(
|
||||
db: &SurrealDbClient,
|
||||
table_name: &str,
|
||||
index_name: &str,
|
||||
) -> u32 {
|
||||
let query = format!("INFO FOR TABLE {table_name};");
|
||||
let mut response = db
|
||||
.client
|
||||
.query(query)
|
||||
.await
|
||||
.expect("Failed to fetch table info");
|
||||
|
||||
let info: surrealdb::Value = response
|
||||
.take(0)
|
||||
.expect("Failed to extract table info response");
|
||||
|
||||
let info_json: serde_json::Value =
|
||||
serde_json::to_value(info).expect("Failed to convert info to json");
|
||||
|
||||
let indexes = info_json["Object"]["indexes"]["Object"]
|
||||
.as_object()
|
||||
.unwrap_or_else(|| panic!("Indexes collection missing in table info: {info_json:#?}"));
|
||||
|
||||
let definition = indexes
|
||||
.get(index_name)
|
||||
.and_then(|definition| definition.get("Strand"))
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or_else(|| panic!("Index definition not found in table info: {info_json:#?}"));
|
||||
|
||||
let dimension_part = definition
|
||||
.split("DIMENSION")
|
||||
.nth(1)
|
||||
.expect("Index definition missing DIMENSION clause");
|
||||
|
||||
let dimension_token = dimension_part
|
||||
.split_whitespace()
|
||||
.next()
|
||||
.expect("Dimension value missing in definition")
|
||||
.trim_end_matches(';');
|
||||
|
||||
dimension_token
|
||||
.parse::<u32>()
|
||||
.expect("Dimension value is not a valid number")
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_settings_initialization() {
|
||||
// Setup in-memory database for testing
|
||||
@@ -212,47 +313,142 @@ mod tests {
|
||||
let initial_chunk = TextChunk::new(
|
||||
"source1".into(),
|
||||
"This chunk has the original dimension".into(),
|
||||
vec![0.1; 1536],
|
||||
"user1".into(),
|
||||
);
|
||||
|
||||
db.store_item(initial_chunk.clone())
|
||||
TextChunk::store_with_embedding(initial_chunk.clone(), vec![0.1; 1536], &db)
|
||||
.await
|
||||
.expect("Failed to store initial chunk");
|
||||
.expect("Failed to store initial chunk with embedding");
|
||||
|
||||
async fn simulate_reembedding(
|
||||
db: &SurrealDbClient,
|
||||
target_dimension: usize,
|
||||
initial_chunk: TextChunk,
|
||||
) {
|
||||
db.query("REMOVE INDEX idx_embedding_chunks ON TABLE text_chunk;")
|
||||
.await
|
||||
.unwrap();
|
||||
db.query(
|
||||
"REMOVE INDEX IF EXISTS idx_embedding_text_chunk_embedding ON TABLE text_chunk_embedding;",
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let define_index_query = format!(
|
||||
"DEFINE INDEX idx_embedding_chunks ON TABLE text_chunk FIELDS embedding HNSW DIMENSION {};",
|
||||
target_dimension
|
||||
);
|
||||
"DEFINE INDEX idx_embedding_text_chunk_embedding ON TABLE text_chunk_embedding FIELDS embedding HNSW DIMENSION {};",
|
||||
target_dimension
|
||||
);
|
||||
db.query(define_index_query)
|
||||
.await
|
||||
.expect("Re-defining index should succeed");
|
||||
|
||||
let new_embedding = vec![0.5; target_dimension];
|
||||
let sql = "UPDATE type::thing('text_chunk', $id) SET embedding = $embedding;";
|
||||
let sql = "UPSERT type::thing('text_chunk_embedding', $id) SET chunk_id = type::thing('text_chunk', $id), embedding = $embedding, user_id = $user_id;";
|
||||
|
||||
let update_result = db
|
||||
.client
|
||||
.query(sql)
|
||||
.bind(("id", initial_chunk.id.clone()))
|
||||
.bind(("user_id", initial_chunk.user_id.clone()))
|
||||
.bind(("embedding", new_embedding))
|
||||
.await;
|
||||
|
||||
assert!(update_result.is_ok());
|
||||
}
|
||||
|
||||
simulate_reembedding(&db, 768, initial_chunk).await;
|
||||
// Re-embed with the existing configured dimension to ensure migrations remain idempotent.
|
||||
let target_dimension = 1536usize;
|
||||
simulate_reembedding(&db, target_dimension, initial_chunk).await;
|
||||
|
||||
let migration_result = db.apply_migrations().await;
|
||||
|
||||
assert!(migration_result.is_ok(), "Migrations should not fail");
|
||||
assert!(
|
||||
migration_result.is_ok(),
|
||||
"Migrations should not fail: {:?}",
|
||||
migration_result.err()
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_should_change_embedding_length_on_indexes_when_switching_length() {
|
||||
let db = SurrealDbClient::memory("test", &Uuid::new_v4().to_string())
|
||||
.await
|
||||
.expect("Failed to start DB");
|
||||
|
||||
// Apply initial migrations. This sets up the text_chunk index with DIMENSION 1536.
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Initial migration failed");
|
||||
|
||||
let mut current_settings = SystemSettings::get_current(&db)
|
||||
.await
|
||||
.expect("Failed to load current settings");
|
||||
|
||||
// Ensure runtime indexes exist with the current embedding dimension so INFO queries succeed.
|
||||
ensure_runtime_indexes(&db, current_settings.embedding_dimensions as usize)
|
||||
.await
|
||||
.expect("failed to build runtime indexes");
|
||||
|
||||
let initial_chunk_dimension = get_hnsw_index_dimension(
|
||||
&db,
|
||||
"text_chunk_embedding",
|
||||
"idx_embedding_text_chunk_embedding",
|
||||
)
|
||||
.await;
|
||||
|
||||
assert_eq!(
|
||||
initial_chunk_dimension, current_settings.embedding_dimensions,
|
||||
"embedding size should match initial system settings"
|
||||
);
|
||||
|
||||
let new_dimension = 768;
|
||||
let new_model = "new-test-embedding-model".to_string();
|
||||
|
||||
current_settings.embedding_dimensions = new_dimension;
|
||||
current_settings.embedding_model = new_model.clone();
|
||||
|
||||
let updated_settings = SystemSettings::update(&db, current_settings)
|
||||
.await
|
||||
.expect("Failed to update settings");
|
||||
|
||||
assert_eq!(
|
||||
updated_settings.embedding_dimensions, new_dimension,
|
||||
"Settings should reflect the new embedding dimension"
|
||||
);
|
||||
|
||||
let openai_client = Client::new();
|
||||
|
||||
TextChunk::update_all_embeddings(&db, &openai_client, &new_model, new_dimension)
|
||||
.await
|
||||
.expect("TextChunk re-embedding should succeed on fresh DB");
|
||||
KnowledgeEntity::update_all_embeddings(&db, &openai_client, &new_model, new_dimension)
|
||||
.await
|
||||
.expect("KnowledgeEntity re-embedding should succeed on fresh DB");
|
||||
|
||||
let text_chunk_dimension = get_hnsw_index_dimension(
|
||||
&db,
|
||||
"text_chunk_embedding",
|
||||
"idx_embedding_text_chunk_embedding",
|
||||
)
|
||||
.await;
|
||||
let knowledge_dimension = get_hnsw_index_dimension(
|
||||
&db,
|
||||
"knowledge_entity_embedding",
|
||||
"idx_embedding_knowledge_entity_embedding",
|
||||
)
|
||||
.await;
|
||||
|
||||
assert_eq!(
|
||||
text_chunk_dimension, new_dimension,
|
||||
"text_chunk index dimension should update"
|
||||
);
|
||||
assert_eq!(
|
||||
knowledge_dimension, new_dimension,
|
||||
"knowledge_entity index dimension should update"
|
||||
);
|
||||
|
||||
let persisted_settings = SystemSettings::get_current(&db)
|
||||
.await
|
||||
.expect("Failed to reload updated settings");
|
||||
assert_eq!(
|
||||
persisted_settings.embedding_dimensions, new_dimension,
|
||||
"Settings should persist new embedding dimension"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
411  common/src/storage/types/text_chunk_embedding.rs  Normal file
@@ -0,0 +1,411 @@
|
||||
use surrealdb::RecordId;
|
||||
|
||||
use crate::storage::types::text_chunk::TextChunk;
|
||||
use crate::{error::AppError, storage::db::SurrealDbClient, stored_object};
|
||||
|
||||
stored_object!(TextChunkEmbedding, "text_chunk_embedding", {
|
||||
/// Record link to the owning text_chunk
|
||||
chunk_id: RecordId,
|
||||
/// Denormalized source id for bulk deletes
|
||||
source_id: String,
|
||||
/// Embedding vector
|
||||
embedding: Vec<f32>,
|
||||
/// Denormalized user id (for scoping + permissions)
|
||||
user_id: String
|
||||
});
|
||||
|
||||
impl TextChunkEmbedding {
|
||||
/// Recreate the HNSW index with a new embedding dimension.
|
||||
///
|
||||
/// This is useful when the embedding length changes; Surreal requires the
|
||||
/// index definition to be recreated with the updated dimension.
|
||||
pub async fn redefine_hnsw_index(
|
||||
db: &SurrealDbClient,
|
||||
dimension: usize,
|
||||
) -> Result<(), AppError> {
|
||||
let query = format!(
|
||||
"BEGIN TRANSACTION;
|
||||
REMOVE INDEX IF EXISTS idx_embedding_text_chunk_embedding ON TABLE {table};
|
||||
DEFINE INDEX idx_embedding_text_chunk_embedding ON TABLE {table} FIELDS embedding HNSW DIMENSION {dimension};
|
||||
COMMIT TRANSACTION;",
|
||||
table = Self::table_name(),
|
||||
);
|
||||
|
||||
let res = db.client.query(query).await.map_err(AppError::Database)?;
|
||||
res.check().map_err(AppError::Database)?;
|
||||
|
||||
Ok(())
|
||||
}
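    // Illustrative sketch (not an existing method): the order of operations when
    // the embedding length changes. The index must carry the new DIMENSION before
    // any new-length vectors are written; rows embedded at the old length still
    // need to be re-embedded afterwards (compare TextChunk::update_all_embeddings).
    pub async fn example_apply_new_dimension(
        db: &SurrealDbClient,
        new_dimension: usize,
    ) -> Result<(), AppError> {
        Self::redefine_hnsw_index(db, new_dimension).await
    }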
|
||||
|
||||
/// Create a new text chunk embedding
|
||||
///
|
||||
/// `chunk_id` is the **key** part of the text_chunk id (e.g. the UUID),
|
||||
/// not "text_chunk:uuid".
|
||||
pub fn new(chunk_id: &str, source_id: String, embedding: Vec<f32>, user_id: String) -> Self {
|
||||
let now = Utc::now();
|
||||
|
||||
Self {
|
||||
// NOTE: `stored_object!` macro defines `id` as `String`
|
||||
id: uuid::Uuid::new_v4().to_string(),
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
// Create a record<text_chunk> link: text_chunk:<chunk_id>
|
||||
chunk_id: RecordId::from_table_key(TextChunk::table_name(), chunk_id),
|
||||
source_id,
|
||||
embedding,
|
||||
user_id,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a single embedding by its chunk RecordId
|
||||
pub async fn get_by_chunk_id(
|
||||
chunk_id: &RecordId,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<Option<Self>, AppError> {
|
||||
let query = format!(
|
||||
"SELECT * FROM {} WHERE chunk_id = $chunk_id LIMIT 1",
|
||||
Self::table_name()
|
||||
);
|
||||
|
||||
let mut result = db
|
||||
.client
|
||||
.query(query)
|
||||
.bind(("chunk_id", chunk_id.clone()))
|
||||
.await
|
||||
.map_err(AppError::Database)?;
|
||||
|
||||
let embeddings: Vec<Self> = result.take(0).map_err(AppError::Database)?;
|
||||
|
||||
Ok(embeddings.into_iter().next())
|
||||
}
|
||||
|
||||
/// Delete embeddings for a given chunk RecordId
|
||||
pub async fn delete_by_chunk_id(
|
||||
chunk_id: &RecordId,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<(), AppError> {
|
||||
let query = format!(
|
||||
"DELETE FROM {} WHERE chunk_id = $chunk_id",
|
||||
Self::table_name()
|
||||
);
|
||||
|
||||
db.client
|
||||
.query(query)
|
||||
.bind(("chunk_id", chunk_id.clone()))
|
||||
.await
|
||||
.map_err(AppError::Database)?
|
||||
.check()
|
||||
.map_err(AppError::Database)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Delete all embeddings that belong to chunks with a given `source_id`
|
||||
///
|
||||
/// This uses the denormalized `source_id` on the embedding table.
|
||||
pub async fn delete_by_source_id(
|
||||
source_id: &str,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<(), AppError> {
|
||||
let query = format!(
|
||||
"DELETE FROM {} WHERE source_id = $source_id",
|
||||
Self::table_name()
|
||||
);
|
||||
|
||||
db.client
|
||||
.query(query)
|
||||
.bind(("source_id", source_id.to_owned()))
|
||||
.await
|
||||
.map_err(AppError::Database)?
|
||||
.check()
|
||||
.map_err(AppError::Database)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::storage::db::SurrealDbClient;
|
||||
use surrealdb::Value as SurrealValue;
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Helper to create an in-memory DB and apply migrations
|
||||
async fn setup_test_db() -> SurrealDbClient {
|
||||
let namespace = "test_ns";
|
||||
let database = Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, &database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.expect("Failed to apply migrations");
|
||||
|
||||
db
|
||||
}
|
||||
|
||||
/// Helper: create a text_chunk with a known key, return its RecordId
|
||||
async fn create_text_chunk_with_id(
|
||||
db: &SurrealDbClient,
|
||||
key: &str,
|
||||
source_id: &str,
|
||||
user_id: &str,
|
||||
) -> RecordId {
|
||||
let chunk = TextChunk {
|
||||
id: key.to_owned(),
|
||||
created_at: Utc::now(),
|
||||
updated_at: Utc::now(),
|
||||
source_id: source_id.to_owned(),
|
||||
chunk: "Some test chunk text".to_owned(),
|
||||
user_id: user_id.to_owned(),
|
||||
};
|
||||
|
||||
db.store_item(chunk)
|
||||
.await
|
||||
.expect("Failed to create text_chunk");
|
||||
|
||||
RecordId::from_table_key(TextChunk::table_name(), key)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_create_and_get_by_chunk_id() {
|
||||
let db = setup_test_db().await;
|
||||
|
||||
let user_id = "user_a";
|
||||
let chunk_key = "chunk-123";
|
||||
let source_id = "source-1";
|
||||
|
||||
// 1) Create a text_chunk with a known key
|
||||
let chunk_rid = create_text_chunk_with_id(&db, chunk_key, source_id, user_id).await;
|
||||
|
||||
// 2) Create and store an embedding for that chunk
|
||||
let embedding_vec = vec![0.1_f32, 0.2, 0.3];
|
||||
let emb = TextChunkEmbedding::new(
|
||||
chunk_key,
|
||||
source_id.to_string(),
|
||||
embedding_vec.clone(),
|
||||
user_id.to_string(),
|
||||
);
|
||||
|
||||
TextChunkEmbedding::redefine_hnsw_index(&db, emb.embedding.len())
|
||||
.await
|
||||
.expect("Failed to redefine index length");
|
||||
|
||||
let _: Option<TextChunkEmbedding> = db
|
||||
.client
|
||||
.create(TextChunkEmbedding::table_name())
|
||||
.content(emb)
|
||||
.await
|
||||
.expect("Failed to store embedding")
|
||||
.take()
|
||||
.expect("Failed to deserialize stored embedding");
|
||||
|
||||
// 3) Fetch it via get_by_chunk_id
|
||||
let fetched = TextChunkEmbedding::get_by_chunk_id(&chunk_rid, &db)
|
||||
.await
|
||||
.expect("Failed to get embedding by chunk_id");
|
||||
|
||||
assert!(fetched.is_some(), "Expected an embedding to be found");
|
||||
let fetched = fetched.unwrap();
|
||||
|
||||
assert_eq!(fetched.user_id, user_id);
|
||||
assert_eq!(fetched.chunk_id, chunk_rid);
|
||||
assert_eq!(fetched.embedding, embedding_vec);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_delete_by_chunk_id() {
|
||||
let db = setup_test_db().await;
|
||||
|
||||
let user_id = "user_b";
|
||||
let chunk_key = "chunk-delete";
|
||||
let source_id = "source-del";
|
||||
|
||||
let chunk_rid = create_text_chunk_with_id(&db, chunk_key, source_id, user_id).await;
|
||||
|
||||
let emb = TextChunkEmbedding::new(
|
||||
chunk_key,
|
||||
source_id.to_string(),
|
||||
vec![0.4_f32, 0.5, 0.6],
|
||||
user_id.to_string(),
|
||||
);
|
||||
|
||||
TextChunkEmbedding::redefine_hnsw_index(&db, emb.embedding.len())
|
||||
.await
|
||||
.expect("Failed to redefine index length");
|
||||
|
||||
let _: Option<TextChunkEmbedding> = db
|
||||
.client
|
||||
.create(TextChunkEmbedding::table_name())
|
||||
.content(emb)
|
||||
.await
|
||||
.expect("Failed to store embedding")
|
||||
.take()
|
||||
.expect("Failed to deserialize stored embedding");
|
||||
|
||||
// Ensure it exists
|
||||
let existing = TextChunkEmbedding::get_by_chunk_id(&chunk_rid, &db)
|
||||
.await
|
||||
.expect("Failed to get embedding before delete");
|
||||
assert!(existing.is_some(), "Embedding should exist before delete");
|
||||
|
||||
// Delete by chunk_id
|
||||
TextChunkEmbedding::delete_by_chunk_id(&chunk_rid, &db)
|
||||
.await
|
||||
.expect("Failed to delete by chunk_id");
|
||||
|
||||
// Ensure it no longer exists
|
||||
let after = TextChunkEmbedding::get_by_chunk_id(&chunk_rid, &db)
|
||||
.await
|
||||
.expect("Failed to get embedding after delete");
|
||||
assert!(after.is_none(), "Embedding should have been deleted");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_delete_by_source_id() {
|
||||
let db = setup_test_db().await;
|
||||
|
||||
let user_id = "user_c";
|
||||
let source_id = "shared-source";
|
||||
let other_source = "other-source";
|
||||
|
||||
// Two chunks with the same source_id
|
||||
let chunk1_rid = create_text_chunk_with_id(&db, "chunk-s1", source_id, user_id).await;
|
||||
let chunk2_rid = create_text_chunk_with_id(&db, "chunk-s2", source_id, user_id).await;
|
||||
|
||||
// One chunk with a different source_id
|
||||
let chunk_other_rid =
|
||||
create_text_chunk_with_id(&db, "chunk-other", other_source, user_id).await;
|
||||
|
||||
// Create embeddings for all three
|
||||
let emb1 = TextChunkEmbedding::new(
|
||||
"chunk-s1",
|
||||
source_id.to_string(),
|
||||
vec![0.1],
|
||||
user_id.to_string(),
|
||||
);
|
||||
let emb2 = TextChunkEmbedding::new(
|
||||
"chunk-s2",
|
||||
source_id.to_string(),
|
||||
vec![0.2],
|
||||
user_id.to_string(),
|
||||
);
|
||||
let emb3 = TextChunkEmbedding::new(
|
||||
"chunk-other",
|
||||
other_source.to_string(),
|
||||
vec![0.3],
|
||||
user_id.to_string(),
|
||||
);
|
||||
|
||||
// Update length on index
|
||||
TextChunkEmbedding::redefine_hnsw_index(&db, emb1.embedding.len())
|
||||
.await
|
||||
.expect("Failed to redefine index length");
|
||||
|
||||
for emb in [emb1, emb2, emb3] {
|
||||
let _: Option<TextChunkEmbedding> = db
|
||||
.client
|
||||
.create(TextChunkEmbedding::table_name())
|
||||
.content(emb)
|
||||
.await
|
||||
.expect("Failed to store embedding")
|
||||
.take()
|
||||
.expect("Failed to deserialize stored embedding");
|
||||
}
|
||||
|
||||
// Sanity check: they all exist
|
||||
assert!(TextChunkEmbedding::get_by_chunk_id(&chunk1_rid, &db)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_some());
|
||||
assert!(TextChunkEmbedding::get_by_chunk_id(&chunk2_rid, &db)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_some());
|
||||
assert!(TextChunkEmbedding::get_by_chunk_id(&chunk_other_rid, &db)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_some());
|
||||
|
||||
// Delete embeddings by source_id (shared-source)
|
||||
TextChunkEmbedding::delete_by_source_id(source_id, &db)
|
||||
.await
|
||||
.expect("Failed to delete by source_id");
|
||||
|
||||
// Chunks from shared-source should have no embeddings
|
||||
assert!(TextChunkEmbedding::get_by_chunk_id(&chunk1_rid, &db)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_none());
|
||||
assert!(TextChunkEmbedding::get_by_chunk_id(&chunk2_rid, &db)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_none());
|
||||
|
||||
// The other chunk should still have its embedding
|
||||
assert!(TextChunkEmbedding::get_by_chunk_id(&chunk_other_rid, &db)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_some());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_redefine_hnsw_index_updates_dimension() {
|
||||
let db = setup_test_db().await;
|
||||
|
||||
// Change the index dimension from default (1536) to a smaller test value.
|
||||
TextChunkEmbedding::redefine_hnsw_index(&db, 8)
|
||||
.await
|
||||
.expect("failed to redefine index");
|
||||
|
||||
let mut info_res = db
|
||||
.client
|
||||
.query("INFO FOR TABLE text_chunk_embedding;")
|
||||
.await
|
||||
.expect("info query failed");
|
||||
let info: SurrealValue = info_res.take(0).expect("failed to take info result");
|
||||
let info_json: serde_json::Value =
|
||||
serde_json::to_value(info).expect("failed to convert info to json");
|
||||
let idx_sql = info_json["Object"]["indexes"]["Object"]
|
||||
["idx_embedding_text_chunk_embedding"]["Strand"]
|
||||
.as_str()
|
||||
.unwrap_or_default();
|
||||
|
||||
assert!(
|
||||
idx_sql.contains("DIMENSION 8"),
|
||||
"expected index definition to contain new dimension, got: {idx_sql}"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_redefine_hnsw_index_is_idempotent() {
|
||||
let db = setup_test_db().await;
|
||||
|
||||
TextChunkEmbedding::redefine_hnsw_index(&db, 4)
|
||||
.await
|
||||
.expect("first redefine failed");
|
||||
TextChunkEmbedding::redefine_hnsw_index(&db, 4)
|
||||
.await
|
||||
.expect("second redefine failed");
|
||||
|
||||
let mut info_res = db
|
||||
.client
|
||||
.query("INFO FOR TABLE text_chunk_embedding;")
|
||||
.await
|
||||
.expect("info query failed");
|
||||
let info: SurrealValue = info_res.take(0).expect("failed to take info result");
|
||||
let info_json: serde_json::Value =
|
||||
serde_json::to_value(info).expect("failed to convert info to json");
|
||||
let idx_sql = info_json["Object"]["indexes"]["Object"]
|
||||
["idx_embedding_text_chunk_embedding"]["Strand"]
|
||||
.as_str()
|
||||
.unwrap_or_default();
|
||||
|
||||
assert!(
|
||||
idx_sql.contains("DIMENSION 4"),
|
||||
"expected index definition to retain dimension 4, got: {idx_sql}"
|
||||
);
|
||||
}
|
||||
}
|
||||
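For orientation, a short usage sketch of the new embedding table, assuming a migrated SurrealDbClient and an existing text_chunk row with key "chunk-123" (the same shape the tests above exercise); this is illustrative glue code, not part of the diff:

// Illustrative only: wire the TextChunkEmbedding pieces together end to end.
async fn store_and_lookup(db: &SurrealDbClient) -> Result<(), AppError> {
    let embedding = vec![0.1_f32, 0.2, 0.3];

    // The HNSW index must match the vector length before inserts.
    TextChunkEmbedding::redefine_hnsw_index(db, embedding.len()).await?;

    let emb = TextChunkEmbedding::new(
        "chunk-123",            // key part of the text_chunk id
        "source-1".to_string(), // denormalized source id
        embedding,
        "user_a".to_string(),
    );
    let _: Option<TextChunkEmbedding> = db
        .client
        .create(TextChunkEmbedding::table_name())
        .content(emb)
        .await
        .map_err(AppError::Database)?;

    // Look it up via the record link, then clean up by source.
    let chunk_rid = RecordId::from_table_key(TextChunk::table_name(), "chunk-123");
    let _found = TextChunkEmbedding::get_by_chunk_id(&chunk_rid, db).await?;
    TextChunkEmbedding::delete_by_source_id("source-1", db).await?;
    Ok(())
}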
@@ -5,6 +5,7 @@ use crate::{error::AppError, storage::db::SurrealDbClient, stored_object};
|
||||
|
||||
use super::file_info::FileInfo;
|
||||
|
||||
#[allow(clippy::module_name_repetitions)]
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct TextContentSearchResult {
|
||||
#[serde(deserialize_with = "deserialize_flexible_id")]
|
||||
@@ -50,8 +51,11 @@ pub struct TextContentSearchResult {
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
|
||||
pub struct UrlInfo {
|
||||
#[serde(default)]
|
||||
pub url: String,
|
||||
#[serde(default)]
|
||||
pub title: String,
|
||||
#[serde(default)]
|
||||
pub image_id: String,
|
||||
}
|
||||
|
||||
@@ -110,6 +114,26 @@ impl TextContent {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn has_other_with_file(
|
||||
file_id: &str,
|
||||
exclude_id: &str,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<bool, AppError> {
|
||||
let mut response = db
|
||||
.client
|
||||
.query(
|
||||
"SELECT VALUE id FROM type::table($table_name) WHERE file_info.id = $file_id AND id != type::thing($table_name, $exclude_id) LIMIT 1",
|
||||
)
|
||||
.bind(("table_name", TextContent::table_name()))
|
||||
.bind(("file_id", file_id.to_owned()))
|
||||
.bind(("exclude_id", exclude_id.to_owned()))
|
||||
.await?;
|
||||
|
||||
let existing: Option<surrealdb::sql::Thing> = response.take(0)?;
|
||||
|
||||
Ok(existing.is_some())
|
||||
}
|
||||
|
||||
pub async fn search(
|
||||
db: &SurrealDbClient,
|
||||
search_terms: &str,
|
||||
@@ -126,12 +150,12 @@ impl TextContent {
|
||||
search::highlight('<b>', '</b>', 4) AS highlighted_url,
|
||||
search::highlight('<b>', '</b>', 5) AS highlighted_url_title,
|
||||
(
|
||||
search::score(0) +
|
||||
search::score(1) +
|
||||
search::score(2) +
|
||||
search::score(3) +
|
||||
search::score(4) +
|
||||
search::score(5)
|
||||
IF search::score(0) != NONE THEN search::score(0) ELSE 0 END +
|
||||
IF search::score(1) != NONE THEN search::score(1) ELSE 0 END +
|
||||
IF search::score(2) != NONE THEN search::score(2) ELSE 0 END +
|
||||
IF search::score(3) != NONE THEN search::score(3) ELSE 0 END +
|
||||
IF search::score(4) != NONE THEN search::score(4) ELSE 0 END +
|
||||
IF search::score(5) != NONE THEN search::score(5) ELSE 0 END
|
||||
) AS score
|
||||
FROM text_content
|
||||
WHERE
|
||||
@@ -276,4 +300,64 @@ mod tests {
|
||||
assert_eq!(updated_content.text, new_text);
|
||||
assert!(updated_content.updated_at > text_content.updated_at);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_has_other_with_file_detects_shared_usage() {
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
let user_id = "user123".to_string();
|
||||
let file_info = FileInfo {
|
||||
id: "file-1".to_string(),
|
||||
created_at: chrono::Utc::now(),
|
||||
updated_at: chrono::Utc::now(),
|
||||
sha256: "sha-test".to_string(),
|
||||
path: "user123/file-1/test.txt".to_string(),
|
||||
file_name: "test.txt".to_string(),
|
||||
mime_type: "text/plain".to_string(),
|
||||
user_id: user_id.clone(),
|
||||
};
|
||||
|
||||
let content_a = TextContent::new(
|
||||
"First".to_string(),
|
||||
Some("ctx-a".to_string()),
|
||||
"category".to_string(),
|
||||
Some(file_info.clone()),
|
||||
None,
|
||||
user_id.clone(),
|
||||
);
|
||||
let content_b = TextContent::new(
|
||||
"Second".to_string(),
|
||||
Some("ctx-b".to_string()),
|
||||
"category".to_string(),
|
||||
Some(file_info.clone()),
|
||||
None,
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
db.store_item(content_a.clone())
|
||||
.await
|
||||
.expect("Failed to store first content");
|
||||
db.store_item(content_b.clone())
|
||||
.await
|
||||
.expect("Failed to store second content");
|
||||
|
||||
let has_other = TextContent::has_other_with_file(&file_info.id, &content_a.id, &db)
|
||||
.await
|
||||
.expect("Failed to check for shared file usage");
|
||||
assert!(has_other);
|
||||
|
||||
let _removed: Option<TextContent> = db
|
||||
.delete_item(&content_b.id)
|
||||
.await
|
||||
.expect("Failed to delete second content");
|
||||
|
||||
let has_other_after = TextContent::has_other_with_file(&file_info.id, &content_a.id, &db)
|
||||
.await
|
||||
.expect("Failed to check shared usage after delete");
|
||||
assert!(!has_other_after);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use crate::{error::AppError, storage::db::SurrealDbClient, stored_object};
|
||||
use anyhow::anyhow;
|
||||
use async_trait::async_trait;
|
||||
use axum_session_auth::Authentication;
|
||||
use chrono_tz::Tz;
|
||||
@@ -8,7 +9,7 @@ use uuid::Uuid;
|
||||
use super::text_chunk::TextChunk;
|
||||
use super::{
|
||||
conversation::Conversation,
|
||||
ingestion_task::{IngestionTask, MAX_ATTEMPTS},
|
||||
ingestion_task::{IngestionTask, TaskState},
|
||||
knowledge_entity::{KnowledgeEntity, KnowledgeEntityType},
|
||||
knowledge_relationship::KnowledgeRelationship,
|
||||
system_settings::SystemSettings,
|
||||
@@ -17,29 +18,95 @@ use super::{
|
||||
use chrono::Duration;
|
||||
use futures::try_join;
|
||||
|
||||
/// Result row for returning user category.
|
||||
#[derive(Deserialize)]
|
||||
pub struct CategoryResponse {
|
||||
/// Category name tied to the user.
|
||||
category: String,
|
||||
}
|
||||
|
||||
stored_object!(User, "user", {
|
||||
use std::str::FromStr;
|
||||
|
||||
/// Supported UI themes.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq, Default)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum Theme {
|
||||
Light,
|
||||
Dark,
|
||||
WarmPaper,
|
||||
ObsidianPrism,
|
||||
#[default]
|
||||
System,
|
||||
}
|
||||
|
||||
impl FromStr for Theme {
|
||||
type Err = ();
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"light" => Ok(Self::Light),
|
||||
"dark" => Ok(Self::Dark),
|
||||
"warm-paper" => Ok(Self::WarmPaper),
|
||||
"obsidian-prism" => Ok(Self::ObsidianPrism),
|
||||
"system" => Ok(Self::System),
|
||||
_ => Err(()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Theme {
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Light => "light",
|
||||
Self::Dark => "dark",
|
||||
Self::WarmPaper => "warm-paper",
|
||||
Self::ObsidianPrism => "obsidian-prism",
|
||||
Self::System => "system",
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the theme that should be initially applied.
|
||||
/// For "system", defaults to "light".
|
||||
pub fn initial_theme(&self) -> &'static str {
|
||||
match self {
|
||||
Self::System => "light",
|
||||
other => other.as_str(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
stored_object!(
|
||||
#[allow(clippy::unsafe_derive_deserialize)]
|
||||
User, "user", {
|
||||
email: String,
|
||||
password: String,
|
||||
anonymous: bool,
|
||||
api_key: Option<String>,
|
||||
admin: bool,
|
||||
#[serde(default)]
|
||||
timezone: String
|
||||
timezone: String,
|
||||
#[serde(default, deserialize_with = "deserialize_theme_or_default")]
|
||||
theme: Theme
|
||||
});
|
||||
|
||||
fn deserialize_theme_or_default<'de, D>(deserializer: D) -> Result<Theme, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let raw = Option::<String>::deserialize(deserializer)?;
|
||||
Ok(raw
|
||||
.and_then(|value| Theme::from_str(value.as_str()).ok())
|
||||
.unwrap_or_default())
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Authentication<User, String, Surreal<Any>> for User {
|
||||
async fn load_user(userid: String, db: Option<&Surreal<Any>>) -> Result<User, anyhow::Error> {
|
||||
let db = db.unwrap();
|
||||
let db = db.ok_or_else(|| anyhow!("Database handle missing"))?;
|
||||
Ok(db
|
||||
.select((Self::table_name(), userid.as_str()))
|
||||
.await?
|
||||
.unwrap())
|
||||
.ok_or_else(|| anyhow!("User {userid} not found"))?)
|
||||
}
|
||||
|
||||
fn is_authenticated(&self) -> bool {
|
||||
@@ -55,14 +122,19 @@ impl Authentication<User, String, Surreal<Any>> for User {
|
||||
}
|
||||
}
|
||||
|
||||
/// Ensures a timezone string parses, defaulting to UTC when invalid.
|
||||
fn validate_timezone(input: &str) -> String {
|
||||
match input.parse::<Tz>() {
|
||||
Ok(_) => input.to_owned(),
|
||||
Err(_) => {
|
||||
tracing::warn!("Invalid timezone '{}' received, defaulting to UTC", input);
|
||||
"UTC".to_owned()
|
||||
}
|
||||
if input.parse::<Tz>().is_ok() {
|
||||
return input.to_owned();
|
||||
}
|
||||
|
||||
tracing::warn!("Invalid timezone '{}' received, defaulting to UTC", input);
|
||||
"UTC".to_owned()
|
||||
}
|
||||
|
||||
/// Ensures a theme string is valid, defaulting to "system" when invalid.
|
||||
fn validate_theme(input: &str) -> Theme {
|
||||
Theme::from_str(input).unwrap_or_default()
|
||||
}
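A small round-trip sketch (not part of the diff) of the Theme helpers above, showing the kebab-case mapping, the fallback for unknown values, and how "system" resolves on first paint:

// Illustrative check of the Theme parsing/serialization contract above.
#[test]
fn theme_round_trip_sketch() {
    assert_eq!(Theme::from_str("warm-paper"), Ok(Theme::WarmPaper));
    assert_eq!(Theme::WarmPaper.as_str(), "warm-paper");
    assert_eq!(validate_theme("no-such-theme"), Theme::System); // unknown -> default
    assert_eq!(Theme::System.initial_theme(), "light"); // "system" initially renders light
}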
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
@@ -77,12 +149,15 @@ pub struct DashboardStats {
|
||||
pub new_text_chunks_week: i64,
|
||||
}
|
||||
|
||||
/// Helper for aggregating `SurrealDB` count responses.
|
||||
#[derive(Deserialize)]
|
||||
struct CountResult {
|
||||
/// Row count returned by the query.
|
||||
count: i64,
|
||||
}
|
||||
|
||||
impl User {
|
||||
/// Counts all objects of a given type belonging to the user.
|
||||
async fn count_total<T: crate::storage::types::StoredObject>(
|
||||
db: &SurrealDbClient,
|
||||
user_id: &str,
|
||||
@@ -94,9 +169,10 @@ impl User {
|
||||
.bind(("user_id", user_id.to_string()))
|
||||
.await?
|
||||
.take(0)?;
|
||||
Ok(result.map(|r| r.count).unwrap_or(0))
|
||||
Ok(result.map_or(0, |r| r.count))
|
||||
}
|
||||
|
||||
/// Counts objects of a given type created after a specific timestamp.
|
||||
async fn count_since<T: crate::storage::types::StoredObject>(
|
||||
db: &SurrealDbClient,
|
||||
user_id: &str,
|
||||
@@ -109,17 +185,19 @@ impl User {
|
||||
)
|
||||
.bind(("table", T::table_name()))
|
||||
.bind(("user_id", user_id.to_string()))
|
||||
.bind(("since", since))
|
||||
.bind(("since", surrealdb::Datetime::from(since)))
|
||||
.await?
|
||||
.take(0)?;
|
||||
Ok(result.map(|r| r.count).unwrap_or(0))
|
||||
Ok(result.map_or(0, |r| r.count))
|
||||
}
|
||||
|
||||
pub async fn get_dashboard_stats(
|
||||
user_id: &str,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<DashboardStats, AppError> {
|
||||
let since = chrono::Utc::now() - Duration::days(7);
|
||||
let since = chrono::Utc::now()
|
||||
.checked_sub_signed(Duration::days(7))
|
||||
.unwrap_or_else(chrono::Utc::now);
|
||||
|
||||
let (
|
||||
total_documents,
|
||||
@@ -157,6 +235,7 @@ impl User {
|
||||
password: String,
|
||||
db: &SurrealDbClient,
|
||||
timezone: String,
|
||||
theme: String,
|
||||
) -> Result<Self, AppError> {
|
||||
// verify that the application allows new creations
|
||||
let systemsettings = SystemSettings::get_current(db).await?;
|
||||
@@ -165,10 +244,11 @@ impl User {
|
||||
}
|
||||
|
||||
let validated_tz = validate_timezone(&timezone);
|
||||
let validated_theme = validate_theme(&theme);
|
||||
let now = Utc::now();
|
||||
let id = Uuid::new_v4().to_string();
|
||||
|
||||
let user: Option<User> = db
|
||||
let user: Option<Self> = db
|
||||
.client
|
||||
.query(
|
||||
"LET $count = (SELECT count() FROM type::table($table))[0].count;
|
||||
@@ -179,7 +259,8 @@ impl User {
|
||||
anonymous = false,
|
||||
created_at = $created_at,
|
||||
updated_at = $updated_at,
|
||||
timezone = $timezone",
|
||||
timezone = $timezone,
|
||||
theme = $theme",
|
||||
)
|
||||
.bind(("table", "user"))
|
||||
.bind(("id", id))
|
||||
@@ -188,6 +269,7 @@ impl User {
|
||||
.bind(("created_at", surrealdb::Datetime::from(now)))
|
||||
.bind(("updated_at", surrealdb::Datetime::from(now)))
|
||||
.bind(("timezone", validated_tz))
|
||||
.bind(("theme", validated_theme.as_str()))
|
||||
.await?
|
||||
.take(1)?;
|
||||
|
||||
@@ -217,7 +299,7 @@ impl User {
|
||||
password: &str,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<Self, AppError> {
|
||||
let user: Option<User> = db
|
||||
let user: Option<Self> = db
|
||||
.client
|
||||
.query(
|
||||
"SELECT * FROM user
|
||||
@@ -235,7 +317,7 @@ impl User {
|
||||
email: &str,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<Option<Self>, AppError> {
|
||||
let user: Option<User> = db
|
||||
let user: Option<Self> = db
|
||||
.client
|
||||
.query("SELECT * FROM user WHERE email = $email LIMIT 1")
|
||||
.bind(("email", email.to_string()))
|
||||
@@ -249,7 +331,7 @@ impl User {
|
||||
api_key: &str,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<Option<Self>, AppError> {
|
||||
let user: Option<User> = db
|
||||
let user: Option<Self> = db
|
||||
.client
|
||||
.query("SELECT * FROM user WHERE api_key = $api_key LIMIT 1")
|
||||
.bind(("api_key", api_key.to_string()))
|
||||
@@ -261,10 +343,10 @@ impl User {
|
||||
|
||||
pub async fn set_api_key(id: &str, db: &SurrealDbClient) -> Result<String, AppError> {
|
||||
// Generate a secure random API key
|
||||
let api_key = format!("sk_{}", Uuid::new_v4().to_string().replace("-", ""));
|
||||
let api_key = format!("sk_{}", Uuid::new_v4().to_string().replace('-', ""));
|
||||
|
||||
// Update the user record with the new API key
|
||||
let user: Option<User> = db
|
||||
let user: Option<Self> = db
|
||||
.client
|
||||
.query(
|
||||
"UPDATE type::thing('user', $id)
|
||||
@@ -285,7 +367,7 @@ impl User {
|
||||
}
|
||||
|
||||
pub async fn revoke_api_key(id: &str, db: &SurrealDbClient) -> Result<(), AppError> {
|
||||
let user: Option<User> = db
|
||||
let user: Option<Self> = db
|
||||
.client
|
||||
.query(
|
||||
"UPDATE type::thing('user', $id)
|
||||
@@ -341,6 +423,7 @@ impl User {
|
||||
) -> Result<Vec<String>, AppError> {
|
||||
#[derive(Deserialize)]
|
||||
struct EntityTypeResponse {
|
||||
/// Raw entity type value from the database.
|
||||
entity_type: String,
|
||||
}
|
||||
|
||||
@@ -357,8 +440,8 @@ impl User {
|
||||
let entity_types: Vec<String> = response
|
||||
.into_iter()
|
||||
.map(|item| {
|
||||
let normalized = KnowledgeEntityType::from(item.entity_type.clone());
|
||||
format!("{:?}", normalized)
|
||||
let normalized = KnowledgeEntityType::from(item.entity_type);
|
||||
format!("{normalized:?}")
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -449,13 +532,26 @@ impl User {
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<(), AppError> {
|
||||
db.query("UPDATE type::thing('user', $user_id) SET timezone = $timezone")
|
||||
.bind(("table_name", User::table_name()))
|
||||
.bind(("table_name", Self::table_name()))
|
||||
.bind(("user_id", user_id.to_string()))
|
||||
.bind(("timezone", timezone.to_string()))
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn update_theme(
|
||||
user_id: &str,
|
||||
theme: &str,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<(), AppError> {
|
||||
let validated_theme = validate_theme(theme);
|
||||
db.query("UPDATE type::thing('user', $user_id) SET theme = $theme")
|
||||
.bind(("user_id", user_id.to_string()))
|
||||
.bind(("theme", validated_theme.as_str()))
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn get_user_categories(
|
||||
user_id: &str,
|
||||
db: &SurrealDbClient,
|
||||
@@ -535,19 +631,43 @@ impl User {
|
||||
let jobs: Vec<IngestionTask> = db
|
||||
.query(
|
||||
"SELECT * FROM type::table($table)
|
||||
WHERE user_id = $user_id
|
||||
AND (
|
||||
status.name = 'Created'
|
||||
OR (
|
||||
status.name = 'InProgress'
|
||||
AND status.attempts < $max_attempts
|
||||
)
|
||||
)
|
||||
ORDER BY created_at DESC",
|
||||
WHERE user_id = $user_id
|
||||
AND (
|
||||
state IN $active_states
|
||||
OR (state = $failed_state AND attempts < max_attempts)
|
||||
)
|
||||
ORDER BY scheduled_at ASC, created_at DESC",
|
||||
)
|
||||
.bind(("table", IngestionTask::table_name()))
|
||||
.bind(("user_id", user_id.to_owned()))
|
||||
.bind((
|
||||
"active_states",
|
||||
vec![
|
||||
TaskState::Pending.as_str(),
|
||||
TaskState::Reserved.as_str(),
|
||||
TaskState::Processing.as_str(),
|
||||
],
|
||||
))
|
||||
.bind(("failed_state", TaskState::Failed.as_str()))
|
||||
.await?
|
||||
.take(0)?;
|
||||
|
||||
Ok(jobs)
|
||||
}
|
||||
|
||||
/// Gets all ingestion tasks for the specified user ordered by newest first
|
||||
pub async fn get_all_ingestion_tasks(
|
||||
user_id: &str,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<Vec<IngestionTask>, AppError> {
|
||||
let jobs: Vec<IngestionTask> = db
|
||||
.query(
|
||||
"SELECT * FROM type::table($table)
|
||||
WHERE user_id = $user_id
|
||||
ORDER BY created_at DESC",
|
||||
)
|
||||
.bind(("table", IngestionTask::table_name()))
|
||||
.bind(("user_id", user_id.to_owned()))
|
||||
.bind(("max_attempts", MAX_ATTEMPTS))
|
||||
.await?
|
||||
.take(0)?;
|
||||
|
||||
@@ -605,7 +725,7 @@ impl User {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::storage::types::ingestion_payload::IngestionPayload;
|
||||
use crate::storage::types::ingestion_task::{IngestionTask, IngestionTaskStatus, MAX_ATTEMPTS};
|
||||
use crate::storage::types::ingestion_task::{IngestionTask, TaskState, MAX_ATTEMPTS};
|
||||
use std::collections::HashSet;
|
||||
|
||||
// Helper function to set up a test database with SystemSettings
|
||||
@@ -638,6 +758,7 @@ mod tests {
|
||||
password.to_string(),
|
||||
&db,
|
||||
timezone.to_string(),
|
||||
"system".to_string(),
|
||||
)
|
||||
.await
|
||||
.expect("Failed to create user");
|
||||
@@ -675,6 +796,7 @@ mod tests {
|
||||
password.to_string(),
|
||||
&db,
|
||||
"UTC".to_string(),
|
||||
"system".to_string(),
|
||||
)
|
||||
.await
|
||||
.expect("Failed to create user");
|
||||
@@ -705,33 +827,36 @@ mod tests {
|
||||
user_id: user_id.to_string(),
|
||||
};
|
||||
|
||||
let created_task = IngestionTask::new(payload.clone(), user_id.to_string()).await;
|
||||
let created_task = IngestionTask::new(payload.clone(), user_id.to_string());
|
||||
db.store_item(created_task.clone())
|
||||
.await
|
||||
.expect("Failed to store created task");
|
||||
|
||||
let mut in_progress_allowed =
|
||||
IngestionTask::new(payload.clone(), user_id.to_string()).await;
|
||||
in_progress_allowed.status = IngestionTaskStatus::InProgress {
|
||||
attempts: 1,
|
||||
last_attempt: chrono::Utc::now(),
|
||||
};
|
||||
db.store_item(in_progress_allowed.clone())
|
||||
let mut processing_task = IngestionTask::new(payload.clone(), user_id.to_string());
|
||||
processing_task.state = TaskState::Processing;
|
||||
processing_task.attempts = 1;
|
||||
db.store_item(processing_task.clone())
|
||||
.await
|
||||
.expect("Failed to store in-progress task");
|
||||
.expect("Failed to store processing task");
|
||||
|
||||
let mut in_progress_blocked =
|
||||
IngestionTask::new(payload.clone(), user_id.to_string()).await;
|
||||
in_progress_blocked.status = IngestionTaskStatus::InProgress {
|
||||
attempts: MAX_ATTEMPTS,
|
||||
last_attempt: chrono::Utc::now(),
|
||||
};
|
||||
db.store_item(in_progress_blocked.clone())
|
||||
let mut failed_retry_task = IngestionTask::new(payload.clone(), user_id.to_string());
|
||||
failed_retry_task.state = TaskState::Failed;
|
||||
failed_retry_task.attempts = 1;
|
||||
failed_retry_task.scheduled_at = chrono::Utc::now() - chrono::Duration::minutes(5);
|
||||
db.store_item(failed_retry_task.clone())
|
||||
.await
|
||||
.expect("Failed to store retryable failed task");
|
||||
|
||||
let mut failed_blocked_task = IngestionTask::new(payload.clone(), user_id.to_string());
|
||||
failed_blocked_task.state = TaskState::Failed;
|
||||
failed_blocked_task.attempts = MAX_ATTEMPTS;
|
||||
failed_blocked_task.error_message = Some("Too many failures".into());
|
||||
db.store_item(failed_blocked_task.clone())
|
||||
.await
|
||||
.expect("Failed to store blocked task");
|
||||
|
||||
let mut completed_task = IngestionTask::new(payload.clone(), user_id.to_string()).await;
|
||||
completed_task.status = IngestionTaskStatus::Completed;
|
||||
let mut completed_task = IngestionTask::new(payload.clone(), user_id.to_string());
|
||||
completed_task.state = TaskState::Succeeded;
|
||||
db.store_item(completed_task.clone())
|
||||
.await
|
||||
.expect("Failed to store completed task");
|
||||
@@ -742,7 +867,7 @@ mod tests {
|
||||
category: "Category".to_string(),
|
||||
user_id: other_user_id.to_string(),
|
||||
};
|
||||
let other_task = IngestionTask::new(other_payload, other_user_id.to_string()).await;
|
||||
let other_task = IngestionTask::new(other_payload, other_user_id.to_string());
|
||||
db.store_item(other_task)
|
||||
.await
|
||||
.expect("Failed to store other user task");
|
||||
@@ -755,10 +880,54 @@ mod tests {
|
||||
unfinished.iter().map(|task| task.id.clone()).collect();
|
||||
|
||||
assert!(unfinished_ids.contains(&created_task.id));
|
||||
assert!(unfinished_ids.contains(&in_progress_allowed.id));
|
||||
assert!(!unfinished_ids.contains(&in_progress_blocked.id));
|
||||
assert!(unfinished_ids.contains(&processing_task.id));
|
||||
assert!(unfinished_ids.contains(&failed_retry_task.id));
|
||||
assert!(!unfinished_ids.contains(&failed_blocked_task.id));
|
||||
assert!(!unfinished_ids.contains(&completed_task.id));
|
||||
assert_eq!(unfinished_ids.len(), 2);
|
||||
assert_eq!(unfinished_ids.len(), 3);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get_all_ingestion_tasks_returns_sorted() {
|
||||
let db = setup_test_db().await;
|
||||
let user_id = "archive_user";
|
||||
let other_user_id = "other_user";
|
||||
|
||||
let payload = IngestionPayload::Text {
|
||||
text: "One".to_string(),
|
||||
context: "Context".to_string(),
|
||||
category: "Category".to_string(),
|
||||
user_id: user_id.to_string(),
|
||||
};
|
||||
|
||||
// Oldest task
|
||||
let mut first = IngestionTask::new(payload.clone(), user_id.to_string());
|
||||
first.created_at = first.created_at - chrono::Duration::minutes(1);
|
||||
first.updated_at = first.created_at;
|
||||
first.state = TaskState::Succeeded;
|
||||
db.store_item(first.clone()).await.expect("store first");
|
||||
|
||||
// Latest task
|
||||
let mut second = IngestionTask::new(payload.clone(), user_id.to_string());
|
||||
second.state = TaskState::Processing;
|
||||
db.store_item(second.clone()).await.expect("store second");
|
||||
|
||||
let other_payload = IngestionPayload::Text {
|
||||
text: "Other".to_string(),
|
||||
context: "Context".to_string(),
|
||||
category: "Category".to_string(),
|
||||
user_id: other_user_id.to_string(),
|
||||
};
|
||||
let other_task = IngestionTask::new(other_payload, other_user_id.to_string());
|
||||
db.store_item(other_task).await.expect("store other");
|
||||
|
||||
let tasks = User::get_all_ingestion_tasks(user_id, &db)
|
||||
.await
|
||||
.expect("fetch all tasks");
|
||||
|
||||
assert_eq!(tasks.len(), 2);
|
||||
assert_eq!(tasks[0].id, second.id); // newest first
|
||||
assert_eq!(tasks[1].id, first.id);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -775,6 +944,7 @@ mod tests {
|
||||
password.to_string(),
|
||||
&db,
|
||||
"UTC".to_string(),
|
||||
"system".to_string(),
|
||||
)
|
||||
.await
|
||||
.expect("Failed to create user");
|
||||
@@ -809,6 +979,7 @@ mod tests {
|
||||
password.to_string(),
|
||||
&db,
|
||||
"UTC".to_string(),
|
||||
"system".to_string(),
|
||||
)
|
||||
.await
|
||||
.expect("Failed to create user");
|
||||
@@ -861,6 +1032,42 @@ mod tests {
|
||||
assert!(not_found.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_set_api_key_with_none_theme() {
|
||||
let db = setup_test_db().await;
|
||||
|
||||
let user = User::create_new(
|
||||
"legacy_theme@example.com".to_string(),
|
||||
"apikey_password".to_string(),
|
||||
&db,
|
||||
"UTC".to_string(),
|
||||
"system".to_string(),
|
||||
)
|
||||
.await
|
||||
.expect("Failed to create user");
|
||||
|
||||
db.client
|
||||
.query("UPDATE type::thing('user', $id) SET theme = NONE")
|
||||
.bind(("id", user.id.clone()))
|
||||
.await
|
||||
.expect("Failed to set user theme to NONE");
|
||||
|
||||
let api_key = User::set_api_key(&user.id, &db)
|
||||
.await
|
||||
.expect("set_api_key should tolerate NONE theme");
|
||||
|
||||
assert!(api_key.starts_with("sk_"));
|
||||
|
||||
let updated_user = db
|
||||
.get_item::<User>(&user.id)
|
||||
.await
|
||||
.expect("Failed to retrieve user")
|
||||
.expect("User should still exist");
|
||||
|
||||
assert_eq!(updated_user.theme, Theme::System);
|
||||
assert_eq!(updated_user.api_key, Some(api_key));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_password_update() {
|
||||
// Setup test database
|
||||
@@ -876,6 +1083,7 @@ mod tests {
|
||||
old_password.to_string(),
|
||||
&db,
|
||||
"UTC".to_string(),
|
||||
"system".to_string(),
|
||||
)
|
||||
.await
|
||||
.expect("Failed to create user");
|
||||
@@ -923,6 +1131,7 @@ mod tests {
|
||||
"password".to_string(),
|
||||
&db,
|
||||
"UTC".to_string(),
|
||||
"system".to_string(),
|
||||
)
|
||||
.await
|
||||
.expect("Failed to create user");
|
||||
@@ -1033,4 +1242,51 @@ mod tests {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_validate_theme() {
|
||||
assert_eq!(validate_theme("light"), Theme::Light);
|
||||
assert_eq!(validate_theme("dark"), Theme::Dark);
|
||||
assert_eq!(validate_theme("system"), Theme::System);
|
||||
assert_eq!(validate_theme("invalid"), Theme::System);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_theme_update() {
|
||||
let db = setup_test_db().await;
|
||||
let email = "theme_test@example.com";
|
||||
let user = User::create_new(
|
||||
email.to_string(),
|
||||
"password".to_string(),
|
||||
&db,
|
||||
"UTC".to_string(),
|
||||
"system".to_string(),
|
||||
)
|
||||
.await
|
||||
.expect("Failed to create user");
|
||||
|
||||
assert_eq!(user.theme, Theme::System);
|
||||
|
||||
User::update_theme(&user.id, "dark", &db)
|
||||
.await
|
||||
.expect("update theme");
|
||||
|
||||
let updated = db
|
||||
.get_item::<User>(&user.id)
|
||||
.await
|
||||
.expect("get user")
|
||||
.unwrap();
|
||||
assert_eq!(updated.theme, Theme::Dark);
|
||||
|
||||
// Invalid theme should default to system (but update_theme calls validate_theme)
|
||||
User::update_theme(&user.id, "invalid", &db)
|
||||
.await
|
||||
.expect("update theme invalid");
|
||||
let updated2 = db
|
||||
.get_item::<User>(&user.id)
|
||||
.await
|
||||
.expect("get user")
|
||||
.unwrap();
|
||||
assert_eq!(updated2.theme, Theme::System);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,16 +1,54 @@
use config::{Config, ConfigError, Environment, File};
use serde::Deserialize;
use std::env;

#[derive(Clone, Deserialize, Debug)]
/// Selects the embedding backend for vector generation.
#[derive(Clone, Deserialize, Debug, Default, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum EmbeddingBackend {
    /// Use OpenAI-compatible API for embeddings.
    OpenAI,
    /// Use FastEmbed local embeddings (default).
    #[default]
    FastEmbed,
    /// Use deterministic hashed embeddings (for testing).
    Hashed,
}

#[derive(Clone, Deserialize, Debug, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum StorageKind {
    Local,
    Memory,
    S3,
}

/// Default storage backend when none is configured.
fn default_storage_kind() -> StorageKind {
    StorageKind::Local
}

fn default_s3_region() -> Option<String> {
    Some("us-east-1".to_string())
}

/// Selects the strategy used for PDF ingestion.
#[derive(Clone, Deserialize, Debug)]
#[serde(rename_all = "kebab-case")]
pub enum PdfIngestMode {
    /// Only rely on classic text extraction (no LLM fallbacks).
    Classic,
    /// Prefer fast text extraction, but fall back to the LLM rendering path when needed.
    LlmFirst,
}

/// Default PDF ingestion mode when unset.
fn default_pdf_ingest_mode() -> PdfIngestMode {
    PdfIngestMode::LlmFirst
}

/// Application configuration loaded from files and environment variables.
|
||||
#[allow(clippy::module_name_repetitions)]
|
||||
#[derive(Clone, Deserialize, Debug)]
|
||||
pub struct AppConfig {
|
||||
pub openai_api_key: String,
|
||||
@@ -26,17 +64,143 @@ pub struct AppConfig {
|
||||
pub openai_base_url: String,
|
||||
#[serde(default = "default_storage_kind")]
|
||||
pub storage: StorageKind,
|
||||
#[serde(default)]
|
||||
pub s3_bucket: Option<String>,
|
||||
#[serde(default)]
|
||||
pub s3_endpoint: Option<String>,
|
||||
#[serde(default = "default_s3_region")]
|
||||
pub s3_region: Option<String>,
|
||||
#[serde(default = "default_pdf_ingest_mode")]
|
||||
pub pdf_ingest_mode: PdfIngestMode,
|
||||
#[serde(default = "default_reranking_enabled")]
|
||||
pub reranking_enabled: bool,
|
||||
#[serde(default)]
|
||||
pub reranking_pool_size: Option<usize>,
|
||||
#[serde(default)]
|
||||
pub fastembed_cache_dir: Option<String>,
|
||||
#[serde(default)]
|
||||
pub fastembed_show_download_progress: Option<bool>,
|
||||
#[serde(default)]
|
||||
pub fastembed_max_length: Option<usize>,
|
||||
#[serde(default)]
|
||||
pub retrieval_strategy: Option<String>,
|
||||
#[serde(default)]
|
||||
pub embedding_backend: EmbeddingBackend,
|
||||
#[serde(default = "default_ingest_max_body_bytes")]
|
||||
pub ingest_max_body_bytes: usize,
|
||||
#[serde(default = "default_ingest_max_files")]
|
||||
pub ingest_max_files: usize,
|
||||
#[serde(default = "default_ingest_max_content_bytes")]
|
||||
pub ingest_max_content_bytes: usize,
|
||||
#[serde(default = "default_ingest_max_context_bytes")]
|
||||
pub ingest_max_context_bytes: usize,
|
||||
#[serde(default = "default_ingest_max_category_bytes")]
|
||||
pub ingest_max_category_bytes: usize,
|
||||
}
|
||||
|
||||
/// Default data directory for persisted assets.
|
||||
fn default_data_dir() -> String {
|
||||
"./data".to_string()
|
||||
}
|
||||
|
||||
/// Default base URL used for OpenAI-compatible APIs.
|
||||
fn default_base_url() -> String {
|
||||
"https://api.openai.com/v1".to_string()
|
||||
}
|
||||
|
||||
/// Whether reranking is enabled by default.
|
||||
fn default_reranking_enabled() -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn default_ingest_max_body_bytes() -> usize {
|
||||
20_000_000
|
||||
}
|
||||
|
||||
fn default_ingest_max_files() -> usize {
|
||||
5
|
||||
}
|
||||
|
||||
fn default_ingest_max_content_bytes() -> usize {
|
||||
262_144
|
||||
}
|
||||
|
||||
fn default_ingest_max_context_bytes() -> usize {
|
||||
16_384
|
||||
}
|
||||
|
||||
fn default_ingest_max_category_bytes() -> usize {
|
||||
128
|
||||
}
|
||||
|
||||
pub fn ensure_ort_path() {
|
||||
if env::var_os("ORT_DYLIB_PATH").is_some() {
|
||||
return;
|
||||
}
|
||||
if let Ok(mut exe) = env::current_exe() {
|
||||
exe.pop();
|
||||
|
||||
if cfg!(target_os = "windows") {
|
||||
for p in [
|
||||
exe.join("onnxruntime.dll"),
|
||||
exe.join("lib").join("onnxruntime.dll"),
|
||||
] {
|
||||
if p.exists() {
|
||||
env::set_var("ORT_DYLIB_PATH", p);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
let name = if cfg!(target_os = "macos") {
|
||||
"libonnxruntime.dylib"
|
||||
} else {
|
||||
"libonnxruntime.so"
|
||||
};
|
||||
let p = exe.join("lib").join(name);
|
||||
if p.exists() {
|
||||
env::set_var("ORT_DYLIB_PATH", p);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for AppConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
openai_api_key: String::new(),
|
||||
surrealdb_address: String::new(),
|
||||
surrealdb_username: String::new(),
|
||||
surrealdb_password: String::new(),
|
||||
surrealdb_namespace: String::new(),
|
||||
surrealdb_database: String::new(),
|
||||
data_dir: default_data_dir(),
|
||||
http_port: 0,
|
||||
openai_base_url: default_base_url(),
|
||||
storage: default_storage_kind(),
|
||||
s3_bucket: None,
|
||||
s3_endpoint: None,
|
||||
s3_region: default_s3_region(),
|
||||
pdf_ingest_mode: default_pdf_ingest_mode(),
|
||||
reranking_enabled: default_reranking_enabled(),
|
||||
reranking_pool_size: None,
|
||||
fastembed_cache_dir: None,
|
||||
fastembed_show_download_progress: None,
|
||||
fastembed_max_length: None,
|
||||
retrieval_strategy: None,
|
||||
embedding_backend: EmbeddingBackend::default(),
|
||||
ingest_max_body_bytes: default_ingest_max_body_bytes(),
|
||||
ingest_max_files: default_ingest_max_files(),
|
||||
ingest_max_content_bytes: default_ingest_max_content_bytes(),
|
||||
ingest_max_context_bytes: default_ingest_max_context_bytes(),
|
||||
ingest_max_category_bytes: default_ingest_max_category_bytes(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Loads the application configuration from the environment and optional config file.
|
||||
#[allow(clippy::module_name_repetitions)]
|
||||
pub fn get_config() -> Result<AppConfig, ConfigError> {
|
||||
ensure_ort_path();
|
||||
|
||||
let config = Config::builder()
|
||||
.add_source(File::with_name("config").required(false))
|
||||
.add_source(Environment::default())
|
||||
|
||||
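As a rough illustration of the new configuration surface, here is a hedged sketch that overrides a few of the added knobs in code and falls back to the Default impl shown above for the rest; the bucket name and values are made up, and in the application these fields come from config.toml or the environment via get_config():

// Illustrative only: construct an AppConfig with a handful of the new options.
fn example_config() -> AppConfig {
    AppConfig {
        storage: StorageKind::S3,
        s3_bucket: Some("minne-data".to_string()), // hypothetical bucket name
        pdf_ingest_mode: PdfIngestMode::Classic,
        embedding_backend: EmbeddingBackend::FastEmbed,
        ingest_max_files: 10,
        // Everything else keeps its default, e.g. s3_region = Some("us-east-1").
        ..AppConfig::default()
    }
}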
@@ -1,19 +1,328 @@
|
||||
use async_openai::types::CreateEmbeddingRequestArgs;
|
||||
use std::{
|
||||
collections::hash_map::DefaultHasher,
|
||||
hash::{Hash, Hasher},
|
||||
str::FromStr,
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use async_openai::{types::CreateEmbeddingRequestArgs, Client};
|
||||
use fastembed::{EmbeddingModel, ModelTrait, TextEmbedding, TextInitOptions};
|
||||
use tokio::sync::Mutex;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::{
|
||||
error::AppError,
|
||||
storage::{db::SurrealDbClient, types::system_settings::SystemSettings},
|
||||
};
|
||||
/// Generates an embedding vector for the given input text using OpenAI's embedding model.
|
||||
|
||||
/// Supported embedding backends.
|
||||
#[allow(clippy::module_name_repetitions)]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
|
||||
pub enum EmbeddingBackend {
|
||||
#[default]
|
||||
OpenAI,
|
||||
FastEmbed,
|
||||
Hashed,
|
||||
}
|
||||
|
||||
impl std::str::FromStr for EmbeddingBackend {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s.to_ascii_lowercase().as_str() {
|
||||
"openai" => Ok(Self::OpenAI),
|
||||
"hashed" => Ok(Self::Hashed),
|
||||
"fastembed" | "fast-embed" | "fast" => Ok(Self::FastEmbed),
|
||||
other => Err(anyhow!(
|
||||
"unknown embedding backend '{other}'. Expected 'openai', 'hashed', or 'fastembed'."
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper around the chosen embedding backend.
|
||||
#[allow(clippy::module_name_repetitions)]
|
||||
#[derive(Clone)]
|
||||
pub struct EmbeddingProvider {
|
||||
/// Concrete backend implementation.
|
||||
inner: EmbeddingInner,
|
||||
}
|
||||
|
||||
/// Concrete embedding implementations.
|
||||
#[derive(Clone)]
|
||||
enum EmbeddingInner {
|
||||
/// Uses an `OpenAI`-compatible API.
|
||||
OpenAI {
|
||||
/// Client used to issue embedding requests.
|
||||
client: Arc<Client<async_openai::config::OpenAIConfig>>,
|
||||
/// Model identifier for the API.
|
||||
model: String,
|
||||
/// Expected output dimensions.
|
||||
dimensions: u32,
|
||||
},
|
||||
/// Generates deterministic hashed embeddings without external calls.
|
||||
Hashed {
|
||||
/// Output vector length.
|
||||
dimension: usize,
|
||||
},
|
||||
/// Uses `FastEmbed` running locally.
|
||||
FastEmbed {
|
||||
/// Shared `FastEmbed` model.
|
||||
model: Arc<Mutex<TextEmbedding>>,
|
||||
/// Model metadata used for info logging.
|
||||
model_name: EmbeddingModel,
|
||||
/// Output vector length.
|
||||
dimension: usize,
|
||||
},
|
||||
}
|
||||
|
||||
impl EmbeddingProvider {
|
||||
pub fn backend_label(&self) -> &'static str {
|
||||
match self.inner {
|
||||
EmbeddingInner::Hashed { .. } => "hashed",
|
||||
EmbeddingInner::FastEmbed { .. } => "fastembed",
|
||||
EmbeddingInner::OpenAI { .. } => "openai",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn dimension(&self) -> usize {
|
||||
match &self.inner {
|
||||
EmbeddingInner::Hashed { dimension } | EmbeddingInner::FastEmbed { dimension, .. } => {
|
||||
*dimension
|
||||
}
|
||||
EmbeddingInner::OpenAI { dimensions, .. } => *dimensions as usize,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn model_code(&self) -> Option<String> {
|
||||
match &self.inner {
|
||||
EmbeddingInner::FastEmbed { model_name, .. } => Some(model_name.to_string()),
|
||||
EmbeddingInner::OpenAI { model, .. } => Some(model.clone()),
|
||||
EmbeddingInner::Hashed { .. } => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn embed(&self, text: &str) -> Result<Vec<f32>> {
|
||||
match &self.inner {
|
||||
EmbeddingInner::Hashed { dimension } => Ok(hashed_embedding(text, *dimension)),
|
||||
EmbeddingInner::FastEmbed { model, .. } => {
|
||||
let mut guard = model.lock().await;
|
||||
let embeddings = guard
|
||||
.embed(vec![text.to_owned()], None)
|
||||
.context("generating fastembed vector")?;
|
||||
embeddings
|
||||
.into_iter()
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("fastembed returned no embedding for input"))
|
||||
}
|
||||
EmbeddingInner::OpenAI {
|
||||
client,
|
||||
model,
|
||||
dimensions,
|
||||
} => {
|
||||
let request = CreateEmbeddingRequestArgs::default()
|
||||
.model(model.clone())
|
||||
.input([text])
|
||||
.dimensions(*dimensions)
|
||||
.build()?;
|
||||
|
||||
let response = client.embeddings().create(request).await?;
|
||||
|
||||
let embedding = response
|
||||
.data
|
||||
.first()
|
||||
.ok_or_else(|| anyhow!("No embedding data received from OpenAI API"))?
|
||||
.embedding
|
||||
.clone();
|
||||
|
||||
Ok(embedding)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn embed_batch(&self, texts: Vec<String>) -> Result<Vec<Vec<f32>>> {
|
||||
match &self.inner {
|
||||
EmbeddingInner::Hashed { dimension } => Ok(texts
|
||||
.into_iter()
|
||||
.map(|text| hashed_embedding(&text, *dimension))
|
||||
.collect()),
|
||||
EmbeddingInner::FastEmbed { model, .. } => {
|
||||
if texts.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
let mut guard = model.lock().await;
|
||||
guard
|
||||
.embed(texts, None)
|
||||
.context("generating fastembed batch embeddings")
|
||||
}
|
||||
EmbeddingInner::OpenAI {
|
||||
client,
|
||||
model,
|
||||
dimensions,
|
||||
} => {
|
||||
if texts.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let request = CreateEmbeddingRequestArgs::default()
|
||||
.model(model.clone())
|
||||
.input(texts)
|
||||
.dimensions(*dimensions)
|
||||
.build()?;
|
||||
|
||||
let response = client.embeddings().create(request).await?;
|
||||
|
||||
let embeddings: Vec<Vec<f32>> = response
|
||||
.data
|
||||
.into_iter()
|
||||
.map(|item| item.embedding)
|
||||
.collect();
|
||||
|
||||
Ok(embeddings)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_openai(
|
||||
client: Arc<Client<async_openai::config::OpenAIConfig>>,
|
||||
model: String,
|
||||
dimensions: u32,
|
||||
) -> Result<Self> {
|
||||
Ok(Self {
|
||||
inner: EmbeddingInner::OpenAI {
|
||||
client,
|
||||
model,
|
||||
dimensions,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn new_fastembed(model_override: Option<String>) -> Result<Self> {
|
||||
let model_name = if let Some(code) = model_override {
|
||||
EmbeddingModel::from_str(&code).map_err(|err| anyhow!(err))?
|
||||
} else {
|
||||
EmbeddingModel::default()
|
||||
};
|
||||
|
||||
let options = TextInitOptions::new(model_name.clone()).with_show_download_progress(true);
|
||||
let model_name_for_task = model_name.clone();
|
||||
let model_name_code = model_name.to_string();
|
||||
|
||||
let (model, dimension) = tokio::task::spawn_blocking(move || -> Result<_> {
|
||||
let model =
|
||||
TextEmbedding::try_new(options).context("initialising FastEmbed text model")?;
|
||||
let info = EmbeddingModel::get_model_info(&model_name_for_task)
|
||||
.ok_or_else(|| anyhow!("FastEmbed model metadata missing for {model_name_code}"))?;
|
||||
Ok((model, info.dim))
|
||||
})
|
||||
.await
|
||||
.context("joining FastEmbed initialisation task")??;
|
||||
|
||||
Ok(EmbeddingProvider {
|
||||
inner: EmbeddingInner::FastEmbed {
|
||||
model: Arc::new(Mutex::new(model)),
|
||||
model_name,
|
||||
dimension,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
pub fn new_hashed(dimension: usize) -> Result<Self> {
|
||||
Ok(EmbeddingProvider {
|
||||
inner: EmbeddingInner::Hashed {
|
||||
dimension: dimension.max(1),
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
/// Creates an embedding provider based on application configuration.
|
||||
///
|
||||
/// Dispatches to the appropriate constructor based on `config.embedding_backend`:
|
||||
/// - `OpenAI`: Requires a valid OpenAI client
|
||||
/// - `FastEmbed`: Uses local embedding model
|
||||
/// - `Hashed`: Uses deterministic hashed embeddings (for testing)
|
||||
pub async fn from_config(
|
||||
config: &crate::utils::config::AppConfig,
|
||||
openai_client: Option<Arc<Client<async_openai::config::OpenAIConfig>>>,
|
||||
) -> Result<Self> {
|
||||
use crate::utils::config::EmbeddingBackend;
|
||||
|
||||
match config.embedding_backend {
|
||||
EmbeddingBackend::OpenAI => {
|
||||
let client = openai_client
|
||||
.ok_or_else(|| anyhow!("OpenAI embedding backend requires an OpenAI client"))?;
|
||||
// Use defaults that match SystemSettings initial values
|
||||
Self::new_openai(client, "text-embedding-3-small".to_string(), 1536)
|
||||
}
|
||||
EmbeddingBackend::FastEmbed => {
|
||||
// Use nomic-embed-text-v1.5 as the default FastEmbed model
|
||||
Self::new_fastembed(Some("nomic-ai/nomic-embed-text-v1.5".to_string())).await
|
||||
}
|
||||
EmbeddingBackend::Hashed => Self::new_hashed(384),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Helper functions for hashed embeddings
/// Generates a hashed embedding vector without external dependencies.
fn hashed_embedding(text: &str, dimension: usize) -> Vec<f32> {
    let dim = dimension.max(1);
    let mut vector = vec![0.0f32; dim];
    if text.is_empty() {
        return vector;
    }

    for token in tokens(text) {
        let idx = bucket(&token, dim);
        if let Some(slot) = vector.get_mut(idx) {
            *slot += 1.0;
        }
    }

    let norm = vector.iter().map(|v| v * v).sum::<f32>().sqrt();
    if norm > 0.0 {
        for value in &mut vector {
            *value /= norm;
        }
    }

    vector
}

/// Tokenizes the text into alphanumeric lowercase tokens.
fn tokens(text: &str) -> impl Iterator<Item = String> + '_ {
    text.split(|c: char| !c.is_ascii_alphanumeric())
        .filter(|token| !token.is_empty())
        .map(str::to_ascii_lowercase)
}

/// Buckets a token into the hashed embedding vector.
#[allow(clippy::arithmetic_side_effects)]
fn bucket(token: &str, dimension: usize) -> usize {
    let safe_dimension = dimension.max(1);
    let mut hasher = DefaultHasher::new();
    token.hash(&mut hasher);
    usize::try_from(hasher.finish()).unwrap_or_default() % safe_dimension
}

// Backward compatibility function
pub async fn generate_embedding_with_provider(
    provider: &EmbeddingProvider,
    input: &str,
) -> Result<Vec<f32>, AppError> {
    provider.embed(input).await.map_err(AppError::from)
}

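A quick illustrative check (not from the diff) of the properties the hashed backend relies on: the same text always hashes to the same vector, and non-empty vectors are L2-normalized. The dimension is an arbitrary example value:

// Illustrative property check for the hashed embedding helpers above.
#[test]
fn hashed_embedding_sketch() {
    let a = hashed_embedding("graph databases store relationships", 8);
    let b = hashed_embedding("graph databases store relationships", 8);
    assert_eq!(a, b); // deterministic: same text, same vector
    let norm: f32 = a.iter().map(|v| v * v).sum::<f32>().sqrt();
    assert!((norm - 1.0).abs() < 1e-5); // unit length after normalization
}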
/// Generates an embedding vector for the given input text using `OpenAI`'s embedding model.
|
||||
///
|
||||
/// This function takes a text input and converts it into a numerical vector representation (embedding)
|
||||
/// using OpenAI's text-embedding-3-small model. These embeddings can be used for semantic similarity
|
||||
/// using `OpenAI`'s text-embedding-3-small model. These embeddings can be used for semantic similarity
|
||||
/// comparisons, vector search, and other natural language processing tasks.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `client`: The OpenAI client instance used to make API requests.
|
||||
/// * `client`: The `OpenAI` client instance used to make API requests.
|
||||
/// * `input`: The text string to generate embeddings for.
|
||||
///
|
||||
/// # Returns
|
||||
@@ -25,9 +334,10 @@ use crate::{
|
||||
/// # Errors
|
||||
///
|
||||
/// This function can return a `AppError` in the following cases:
|
||||
/// * If the OpenAI API request fails
|
||||
/// * If the `OpenAI` API request fails
|
||||
/// * If the request building fails
|
||||
/// * If no embedding data is received in the response
|
||||
#[allow(clippy::module_name_repetitions)]
|
||||
pub async fn generate_embedding(
|
||||
client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||
input: &str,
|
||||
|
||||
common/src/utils/ingest_limits.rs (new file, 113 lines)
@@ -0,0 +1,113 @@
|
||||
use super::config::AppConfig;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum IngestValidationError {
|
||||
PayloadTooLarge(String),
|
||||
BadRequest(String),
|
||||
}
|
||||
|
||||
pub fn validate_ingest_input(
|
||||
config: &AppConfig,
|
||||
content: Option<&str>,
|
||||
context: &str,
|
||||
category: &str,
|
||||
file_count: usize,
|
||||
) -> Result<(), IngestValidationError> {
|
||||
if file_count > config.ingest_max_files {
|
||||
return Err(IngestValidationError::BadRequest(format!(
|
||||
"Too many files. Maximum allowed is {}",
|
||||
config.ingest_max_files
|
||||
)));
|
||||
}
|
||||
|
||||
if let Some(content) = content {
|
||||
if content.len() > config.ingest_max_content_bytes {
|
||||
return Err(IngestValidationError::PayloadTooLarge(format!(
|
||||
"Content is too large. Maximum allowed is {} bytes",
|
||||
config.ingest_max_content_bytes
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
if context.len() > config.ingest_max_context_bytes {
|
||||
return Err(IngestValidationError::PayloadTooLarge(format!(
|
||||
"Context is too large. Maximum allowed is {} bytes",
|
||||
config.ingest_max_context_bytes
|
||||
)));
|
||||
}
|
||||
|
||||
if category.len() > config.ingest_max_category_bytes {
|
||||
return Err(IngestValidationError::PayloadTooLarge(format!(
|
||||
"Category is too large. Maximum allowed is {} bytes",
|
||||
config.ingest_max_category_bytes
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
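
// Illustrative sketch (not part of the diff): mapping `IngestValidationError` to an HTTP
// status code in a handler. The helper name and the status choices are assumptions for
// the example, not code from the repository.
fn status_for(error: &IngestValidationError) -> u16 {
    match error {
        // 413 Payload Too Large for size violations, 400 Bad Request otherwise.
        IngestValidationError::PayloadTooLarge(_) => 413,
        IngestValidationError::BadRequest(_) => 400,
    }
}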
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn validate_ingest_input_rejects_too_many_files() {
        let config = AppConfig {
            ingest_max_files: 1,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, Some("ok"), "ctx", "cat", 2);

        assert!(matches!(result, Err(IngestValidationError::BadRequest(_))));
    }

    #[test]
    fn validate_ingest_input_rejects_oversized_content() {
        let config = AppConfig {
            ingest_max_content_bytes: 4,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, Some("12345"), "ctx", "cat", 0);

        assert!(matches!(
            result,
            Err(IngestValidationError::PayloadTooLarge(_))
        ));
    }

    #[test]
    fn validate_ingest_input_rejects_oversized_context() {
        let config = AppConfig {
            ingest_max_context_bytes: 2,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, None, "long", "cat", 0);

        assert!(matches!(
            result,
            Err(IngestValidationError::PayloadTooLarge(_))
        ));
    }

    #[test]
    fn validate_ingest_input_rejects_oversized_category() {
        let config = AppConfig {
            ingest_max_category_bytes: 2,
            ..Default::default()
        };
        let result = validate_ingest_input(&config, None, "ok", "long", 0);

        assert!(matches!(
            result,
            Err(IngestValidationError::PayloadTooLarge(_))
        ));
    }

    #[test]
    fn validate_ingest_input_accepts_valid_payload() {
        let config = AppConfig::default();
        let result = validate_ingest_input(&config, Some("ok"), "ctx", "cat", 1);

        assert!(result.is_ok());
    }
}
@@ -1,3 +1,4 @@
pub mod config;
pub mod embedding;
pub mod ingest_limits;
pub mod template_engine;

@@ -4,6 +4,7 @@ pub use minijinja_contrib
pub use minijinja_embed;
use std::sync::Arc;

#[allow(clippy::module_name_repetitions)]
pub trait ProvidesTemplateEngine {
    fn template_engine(&self) -> &Arc<TemplateEngine>;
}
@@ -20,19 +21,49 @@ pub enum TemplateEngine {

#[macro_export]
macro_rules! create_template_engine {
    // Macro takes the relative path to the templates dir as input
    ($relative_path:expr) => {{
    // Single path argument
    ($relative_path:expr) => {
        $crate::create_template_engine!($relative_path, Option::<&str>::None)
    };

    // Path + Fallback argument
    ($relative_path:expr, $fallback_path:expr) => {{
        // Code for debug builds (AutoReload)
        #[cfg(debug_assertions)]
        {
            // These lines execute in the CALLING crate's context
            let crate_dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
            let template_path = crate_dir.join($relative_path);
            let fallback_path = $fallback_path.map(|p| crate_dir.join(p));

            let reloader = $crate::utils::template_engine::AutoReloader::new(move |notifier| {
                let mut env = $crate::utils::template_engine::Environment::new();
                env.set_loader($crate::utils::template_engine::path_loader(&template_path));

                let loader_primary = $crate::utils::template_engine::path_loader(&template_path);

                // Clone fallback_path for the closure
                let fallback = fallback_path.clone();

                env.set_loader(move |name| match loader_primary(name) {
                    Ok(Some(tmpl)) => Ok(Some(tmpl)),
                    Ok(None) => {
                        if let Some(ref fb_path) = fallback {
                            let loader_fallback =
                                $crate::utils::template_engine::path_loader(fb_path);
                            loader_fallback(name)
                        } else {
                            Ok(None)
                        }
                    }
                    Err(e) => Err(e),
                });

                notifier.set_fast_reload(true);
                notifier.watch_path(&template_path, true);
                if let Some(ref fb) = fallback_path {
                    notifier.watch_path(fb, true);
                }

                // Add contrib filters/functions
                $crate::utils::template_engine::minijinja_contrib::add_to_environment(&mut env);
                Ok(env)
@@ -59,13 +90,13 @@ impl TemplateEngine {
        match self {
            // Only compile this arm for debug builds
            #[cfg(debug_assertions)]
            TemplateEngine::AutoReload(reloader) => {
            Self::AutoReload(reloader) => {
                let env = reloader.acquire_env()?;
                env.get_template(name)?.render(ctx)
            }
            // Only compile this arm for release builds
            #[cfg(not(debug_assertions))]
            TemplateEngine::Embedded(env) => env.get_template(name)?.render(ctx),
            Self::Embedded(env) => env.get_template(name)?.render(ctx),
        }
    }

@@ -78,19 +109,17 @@ impl TemplateEngine {
        match self {
            // Only compile this arm for debug builds
            #[cfg(debug_assertions)]
            TemplateEngine::AutoReload(reloader) => {
                let env = reloader.acquire_env()?;
                let template = env.get_template(template_name)?;
                let mut state = template.eval_to_state(context)?;
                state.render_block(block_name)
            }
            Self::AutoReload(reloader) => reloader
                .acquire_env()?
                .get_template(template_name)?
                .eval_to_state(context)?
                .render_block(block_name),
            // Only compile this arm for release builds
            #[cfg(not(debug_assertions))]
            TemplateEngine::Embedded(env) => {
                let template = env.get_template(template_name)?;
                let mut state = template.eval_to_state(context)?;
                state.render_block(block_name)
            }
            Self::Embedded(env) => env
                .get_template(template_name)?
                .eval_to_state(context)?
                .render_block(block_name),
        }
    }
}
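
// Illustrative sketch (not part of the diff): how a calling crate might use the macro's
// two arms. The template directory names below are assumptions for the example.
//
// // Single path argument: templates only, no fallback directory.
// let engine = create_template_engine!("templates");
//
// // Path + fallback: missing templates are looked up in a shared directory.
// let engine = create_template_engine!("templates", Some("../shared-templates"));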
@@ -1,340 +0,0 @@
|
||||
use surrealdb::Error;
|
||||
use tracing::debug;
|
||||
|
||||
use common::storage::{db::SurrealDbClient, types::knowledge_entity::KnowledgeEntity};
|
||||
|
||||
/// Retrieves database entries that match a specific source identifier.
|
||||
///
|
||||
/// This function queries the database for all records in a specified table that have
|
||||
/// a matching `source_id` field. It's commonly used to find related entities or
|
||||
/// track the origin of database entries.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `source_id` - The identifier to search for in the database
|
||||
/// * `table_name` - The name of the table to search in
|
||||
/// * `db_client` - The SurrealDB client instance for database operations
|
||||
///
|
||||
/// # Type Parameters
|
||||
///
|
||||
/// * `T` - The type to deserialize the query results into. Must implement `serde::Deserialize`
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a `Result` containing either:
|
||||
/// * `Ok(Vec<T>)` - A vector of matching records deserialized into type `T`
|
||||
/// * `Err(Error)` - An error if the database query fails
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// This function will return a `Error` if:
|
||||
/// * The database query fails to execute
|
||||
/// * The results cannot be deserialized into type `T`
|
||||
pub async fn find_entities_by_source_ids<T>(
|
||||
source_id: Vec<String>,
|
||||
table_name: String,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<Vec<T>, Error>
|
||||
where
|
||||
T: for<'de> serde::Deserialize<'de>,
|
||||
{
|
||||
let query = "SELECT * FROM type::table($table) WHERE source_id IN $source_ids";
|
||||
|
||||
db.query(query)
|
||||
.bind(("table", table_name))
|
||||
.bind(("source_ids", source_id))
|
||||
.await?
|
||||
.take(0)
|
||||
}
|
||||
|
||||
/// Find entities by their relationship to the id
|
||||
pub async fn find_entities_by_relationship_by_id(
|
||||
db: &SurrealDbClient,
|
||||
entity_id: String,
|
||||
) -> Result<Vec<KnowledgeEntity>, Error> {
|
||||
let query = format!(
|
||||
"SELECT *, <-> relates_to <-> knowledge_entity AS related FROM knowledge_entity:`{}`",
|
||||
entity_id
|
||||
);
|
||||
|
||||
debug!("{}", query);
|
||||
|
||||
db.query(query).await?.take(0)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use common::storage::types::knowledge_entity::{KnowledgeEntity, KnowledgeEntityType};
|
||||
use common::storage::types::knowledge_relationship::KnowledgeRelationship;
|
||||
use common::storage::types::StoredObject;
|
||||
use uuid::Uuid;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_find_entities_by_source_ids() {
|
||||
// Setup in-memory database for testing
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
// Create some test entities with different source_ids
|
||||
let source_id1 = "source123".to_string();
|
||||
let source_id2 = "source456".to_string();
|
||||
let source_id3 = "source789".to_string();
|
||||
|
||||
let entity_type = KnowledgeEntityType::Document;
|
||||
let embedding = vec![0.1, 0.2, 0.3];
|
||||
let user_id = "user123".to_string();
|
||||
|
||||
// Entity with source_id1
|
||||
let entity1 = KnowledgeEntity::new(
|
||||
source_id1.clone(),
|
||||
"Entity 1".to_string(),
|
||||
"Description 1".to_string(),
|
||||
entity_type.clone(),
|
||||
None,
|
||||
embedding.clone(),
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
// Entity with source_id2
|
||||
let entity2 = KnowledgeEntity::new(
|
||||
source_id2.clone(),
|
||||
"Entity 2".to_string(),
|
||||
"Description 2".to_string(),
|
||||
entity_type.clone(),
|
||||
None,
|
||||
embedding.clone(),
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
// Another entity with source_id1
|
||||
let entity3 = KnowledgeEntity::new(
|
||||
source_id1.clone(),
|
||||
"Entity 3".to_string(),
|
||||
"Description 3".to_string(),
|
||||
entity_type.clone(),
|
||||
None,
|
||||
embedding.clone(),
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
// Entity with source_id3
|
||||
let entity4 = KnowledgeEntity::new(
|
||||
source_id3.clone(),
|
||||
"Entity 4".to_string(),
|
||||
"Description 4".to_string(),
|
||||
entity_type.clone(),
|
||||
None,
|
||||
embedding.clone(),
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
// Store all entities
|
||||
db.store_item(entity1.clone())
|
||||
.await
|
||||
.expect("Failed to store entity 1");
|
||||
db.store_item(entity2.clone())
|
||||
.await
|
||||
.expect("Failed to store entity 2");
|
||||
db.store_item(entity3.clone())
|
||||
.await
|
||||
.expect("Failed to store entity 3");
|
||||
db.store_item(entity4.clone())
|
||||
.await
|
||||
.expect("Failed to store entity 4");
|
||||
|
||||
// Test finding entities by multiple source_ids
|
||||
let source_ids = vec![source_id1.clone(), source_id2.clone()];
|
||||
let found_entities: Vec<KnowledgeEntity> =
|
||||
find_entities_by_source_ids(source_ids, KnowledgeEntity::table_name().to_string(), &db)
|
||||
.await
|
||||
.expect("Failed to find entities by source_ids");
|
||||
|
||||
// Should find 3 entities (2 with source_id1, 1 with source_id2)
|
||||
assert_eq!(
|
||||
found_entities.len(),
|
||||
3,
|
||||
"Should find 3 entities with the specified source_ids"
|
||||
);
|
||||
|
||||
// Check that entities with source_id1 and source_id2 are found
|
||||
let found_source_ids: Vec<String> =
|
||||
found_entities.iter().map(|e| e.source_id.clone()).collect();
|
||||
assert!(
|
||||
found_source_ids.contains(&source_id1),
|
||||
"Should find entities with source_id1"
|
||||
);
|
||||
assert!(
|
||||
found_source_ids.contains(&source_id2),
|
||||
"Should find entities with source_id2"
|
||||
);
|
||||
assert!(
|
||||
!found_source_ids.contains(&source_id3),
|
||||
"Should not find entities with source_id3"
|
||||
);
|
||||
|
||||
// Test finding entities by a single source_id
|
||||
let single_source_id = vec![source_id1.clone()];
|
||||
let found_entities: Vec<KnowledgeEntity> = find_entities_by_source_ids(
|
||||
single_source_id,
|
||||
KnowledgeEntity::table_name().to_string(),
|
||||
&db,
|
||||
)
|
||||
.await
|
||||
.expect("Failed to find entities by single source_id");
|
||||
|
||||
// Should find 2 entities with source_id1
|
||||
assert_eq!(
|
||||
found_entities.len(),
|
||||
2,
|
||||
"Should find 2 entities with source_id1"
|
||||
);
|
||||
|
||||
// Check that all found entities have source_id1
|
||||
for entity in found_entities {
|
||||
assert_eq!(
|
||||
entity.source_id, source_id1,
|
||||
"All found entities should have source_id1"
|
||||
);
|
||||
}
|
||||
|
||||
// Test finding entities with non-existent source_id
|
||||
let non_existent_source_id = vec!["non_existent_source".to_string()];
|
||||
let found_entities: Vec<KnowledgeEntity> = find_entities_by_source_ids(
|
||||
non_existent_source_id,
|
||||
KnowledgeEntity::table_name().to_string(),
|
||||
&db,
|
||||
)
|
||||
.await
|
||||
.expect("Failed to find entities by non-existent source_id");
|
||||
|
||||
// Should find 0 entities
|
||||
assert_eq!(
|
||||
found_entities.len(),
|
||||
0,
|
||||
"Should find 0 entities with non-existent source_id"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_find_entities_by_relationship_by_id() {
|
||||
// Setup in-memory database for testing
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
// Create some test entities
|
||||
let entity_type = KnowledgeEntityType::Document;
|
||||
let embedding = vec![0.1, 0.2, 0.3];
|
||||
let user_id = "user123".to_string();
|
||||
|
||||
// Create the central entity we'll query relationships for
|
||||
let central_entity = KnowledgeEntity::new(
|
||||
"central_source".to_string(),
|
||||
"Central Entity".to_string(),
|
||||
"Central Description".to_string(),
|
||||
entity_type.clone(),
|
||||
None,
|
||||
embedding.clone(),
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
// Create related entities
|
||||
let related_entity1 = KnowledgeEntity::new(
|
||||
"related_source1".to_string(),
|
||||
"Related Entity 1".to_string(),
|
||||
"Related Description 1".to_string(),
|
||||
entity_type.clone(),
|
||||
None,
|
||||
embedding.clone(),
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
let related_entity2 = KnowledgeEntity::new(
|
||||
"related_source2".to_string(),
|
||||
"Related Entity 2".to_string(),
|
||||
"Related Description 2".to_string(),
|
||||
entity_type.clone(),
|
||||
None,
|
||||
embedding.clone(),
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
// Create an unrelated entity
|
||||
let unrelated_entity = KnowledgeEntity::new(
|
||||
"unrelated_source".to_string(),
|
||||
"Unrelated Entity".to_string(),
|
||||
"Unrelated Description".to_string(),
|
||||
entity_type.clone(),
|
||||
None,
|
||||
embedding.clone(),
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
// Store all entities
|
||||
let central_entity = db
|
||||
.store_item(central_entity.clone())
|
||||
.await
|
||||
.expect("Failed to store central entity")
|
||||
.unwrap();
|
||||
let related_entity1 = db
|
||||
.store_item(related_entity1.clone())
|
||||
.await
|
||||
.expect("Failed to store related entity 1")
|
||||
.unwrap();
|
||||
let related_entity2 = db
|
||||
.store_item(related_entity2.clone())
|
||||
.await
|
||||
.expect("Failed to store related entity 2")
|
||||
.unwrap();
|
||||
let _unrelated_entity = db
|
||||
.store_item(unrelated_entity.clone())
|
||||
.await
|
||||
.expect("Failed to store unrelated entity")
|
||||
.unwrap();
|
||||
|
||||
// Create relationships
|
||||
let source_id = "relationship_source".to_string();
|
||||
|
||||
// Create relationship 1: central -> related1
|
||||
let relationship1 = KnowledgeRelationship::new(
|
||||
central_entity.id.clone(),
|
||||
related_entity1.id.clone(),
|
||||
user_id.clone(),
|
||||
source_id.clone(),
|
||||
"references".to_string(),
|
||||
);
|
||||
|
||||
// Create relationship 2: central -> related2
|
||||
let relationship2 = KnowledgeRelationship::new(
|
||||
central_entity.id.clone(),
|
||||
related_entity2.id.clone(),
|
||||
user_id.clone(),
|
||||
source_id.clone(),
|
||||
"contains".to_string(),
|
||||
);
|
||||
|
||||
// Store relationships
|
||||
relationship1
|
||||
.store_relationship(&db)
|
||||
.await
|
||||
.expect("Failed to store relationship 1");
|
||||
relationship2
|
||||
.store_relationship(&db)
|
||||
.await
|
||||
.expect("Failed to store relationship 2");
|
||||
|
||||
// Test finding entities related to the central entity
|
||||
let related_entities = find_entities_by_relationship_by_id(&db, central_entity.id.clone())
|
||||
.await
|
||||
.expect("Failed to find entities by relationship");
|
||||
|
||||
// Check that we found relationships
|
||||
assert!(related_entities.len() > 0, "Should find related entities");
|
||||
}
|
||||
}
|
||||
@@ -1,90 +0,0 @@
|
||||
pub mod answer_retrieval;
|
||||
pub mod answer_retrieval_helper;
|
||||
pub mod graph;
|
||||
pub mod vector;
|
||||
|
||||
use common::{
|
||||
error::AppError,
|
||||
storage::{
|
||||
db::SurrealDbClient,
|
||||
types::{knowledge_entity::KnowledgeEntity, text_chunk::TextChunk},
|
||||
},
|
||||
};
|
||||
use futures::future::{try_join, try_join_all};
|
||||
use graph::{find_entities_by_relationship_by_id, find_entities_by_source_ids};
|
||||
use std::collections::HashMap;
|
||||
use vector::find_items_by_vector_similarity;
|
||||
|
||||
/// Performs a comprehensive knowledge entity retrieval using multiple search strategies
|
||||
/// to find the most relevant entities for a given query.
|
||||
///
|
||||
/// # Strategy
|
||||
/// The function employs a three-pronged approach to knowledge retrieval:
|
||||
/// 1. Direct vector similarity search on knowledge entities
|
||||
/// 2. Text chunk similarity search with source entity lookup
|
||||
/// 3. Graph relationship traversal from related entities
|
||||
///
|
||||
/// This combined approach ensures both semantic similarity matches and structurally
|
||||
/// related content are included in the results.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `db_client` - SurrealDB client for database operations
|
||||
/// * `openai_client` - OpenAI client for vector embeddings generation
|
||||
/// * `query` - The search query string to find relevant knowledge entities
|
||||
/// * 'user_id' - The user id of the current user
|
||||
///
|
||||
/// # Returns
|
||||
/// * `Result<Vec<KnowledgeEntity>, AppError>` - A deduplicated vector of relevant
|
||||
/// knowledge entities, or an error if the retrieval process fails
|
||||
pub async fn retrieve_entities(
|
||||
db_client: &SurrealDbClient,
|
||||
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||
query: &str,
|
||||
user_id: &str,
|
||||
) -> Result<Vec<KnowledgeEntity>, AppError> {
|
||||
let (items_from_knowledge_entity_similarity, closest_chunks) = try_join(
|
||||
find_items_by_vector_similarity(
|
||||
10,
|
||||
query,
|
||||
db_client,
|
||||
"knowledge_entity",
|
||||
openai_client,
|
||||
user_id,
|
||||
),
|
||||
find_items_by_vector_similarity(5, query, db_client, "text_chunk", openai_client, user_id),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let source_ids = closest_chunks
|
||||
.iter()
|
||||
.map(|chunk: &TextChunk| chunk.source_id.clone())
|
||||
.collect::<Vec<String>>();
|
||||
|
||||
let items_from_text_chunk_similarity: Vec<KnowledgeEntity> =
|
||||
find_entities_by_source_ids(source_ids, "knowledge_entity".to_string(), db_client).await?;
|
||||
|
||||
let items_from_relationships_futures: Vec<_> = items_from_text_chunk_similarity
|
||||
.clone()
|
||||
.into_iter()
|
||||
.map(|entity| find_entities_by_relationship_by_id(db_client, entity.id.clone()))
|
||||
.collect();
|
||||
|
||||
let items_from_relationships = try_join_all(items_from_relationships_futures)
|
||||
.await?
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.collect::<Vec<KnowledgeEntity>>();
|
||||
|
||||
let entities: Vec<KnowledgeEntity> = items_from_knowledge_entity_similarity
|
||||
.into_iter()
|
||||
.chain(items_from_text_chunk_similarity.into_iter())
|
||||
.chain(items_from_relationships.into_iter())
|
||||
.fold(HashMap::new(), |mut map, entity| {
|
||||
map.insert(entity.id.clone(), entity);
|
||||
map
|
||||
})
|
||||
.into_values()
|
||||
.collect();
|
||||
|
||||
Ok(entities)
|
||||
}
|
||||
@@ -1,45 +0,0 @@
|
||||
use common::{error::AppError, storage::db::SurrealDbClient, utils::embedding::generate_embedding};
|
||||
|
||||
/// Compares vectors and retrieves a number of items from the specified table.
|
||||
///
|
||||
/// This function generates embeddings for the input text, constructs a query to find the closest matches in the database,
|
||||
/// and then deserializes the results into the specified type `T`.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `take` - The number of items to retrieve from the database.
|
||||
/// * `input_text` - The text to generate embeddings for.
|
||||
/// * `db_client` - The SurrealDB client to use for querying the database.
|
||||
/// * `table` - The table to query in the database.
|
||||
/// * `openai_client` - The OpenAI client to use for generating embeddings.
|
||||
/// * 'user_id`- The user id of the current user.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A vector of type `T` containing the closest matches to the input text. Returns a `ProcessingError` if an error occurs.
|
||||
///
|
||||
/// # Type Parameters
|
||||
///
|
||||
/// * `T` - The type to deserialize the query results into. Must implement `serde::Deserialize`.
|
||||
pub async fn find_items_by_vector_similarity<T>(
|
||||
take: u8,
|
||||
input_text: &str,
|
||||
db_client: &SurrealDbClient,
|
||||
table: &str,
|
||||
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||
user_id: &str,
|
||||
) -> Result<Vec<T>, AppError>
|
||||
where
|
||||
T: for<'de> serde::Deserialize<'de>,
|
||||
{
|
||||
// Generate embeddings
|
||||
let input_embedding = generate_embedding(openai_client, input_text, db_client).await?;
|
||||
|
||||
// Construct the query
|
||||
let closest_query = format!("SELECT *, vector::distance::knn() AS distance FROM {} WHERE user_id = '{}' AND embedding <|{},40|> {:?} ORDER BY distance", table, user_id, take, input_embedding);
|
||||
|
||||
// Perform query and deserialize to struct
|
||||
let closest_entities: Vec<T> = db_client.query(closest_query).await?.take(0)?;
|
||||
|
||||
Ok(closest_entities)
|
||||
}
|
||||
devenv.lock | 102
@@ -3,10 +3,10 @@
|
||||
"devenv": {
|
||||
"locked": {
|
||||
"dir": "src/modules",
|
||||
"lastModified": 1746681099,
|
||||
"lastModified": 1771066302,
|
||||
"owner": "cachix",
|
||||
"repo": "devenv",
|
||||
"rev": "a7f2ea275621391209fd702f5ddced32dd56a4e2",
|
||||
"rev": "1b355dec9bddbaddbe4966d6fc30d7aa3af8575b",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -16,17 +16,35 @@
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"flake-compat": {
|
||||
"flake": false,
|
||||
"fenix": {
|
||||
"inputs": {
|
||||
"nixpkgs": "nixpkgs",
|
||||
"rust-analyzer-src": "rust-analyzer-src"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1733328505,
|
||||
"owner": "edolstra",
|
||||
"repo": "flake-compat",
|
||||
"rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
|
||||
"lastModified": 1771052630,
|
||||
"owner": "nix-community",
|
||||
"repo": "fenix",
|
||||
"rev": "d0555da98576b8611c25df0c208e51e9a182d95f",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "edolstra",
|
||||
"owner": "nix-community",
|
||||
"repo": "fenix",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"flake-compat": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1767039857,
|
||||
"owner": "NixOS",
|
||||
"repo": "flake-compat",
|
||||
"rev": "5edf11c44bc78a0d334f6334cdaf7d60d732daab",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"repo": "flake-compat",
|
||||
"type": "github"
|
||||
}
|
||||
@@ -40,10 +58,10 @@
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1746537231,
|
||||
"lastModified": 1770726378,
|
||||
"owner": "cachix",
|
||||
"repo": "git-hooks.nix",
|
||||
"rev": "fa466640195d38ec97cf0493d6d6882bc4d14969",
|
||||
"rev": "5eaaedde414f6eb1aea8b8525c466dc37bba95ae",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -60,10 +78,10 @@
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1709087332,
|
||||
"lastModified": 1762808025,
|
||||
"owner": "hercules-ci",
|
||||
"repo": "gitignore.nix",
|
||||
"rev": "637db329424fd7e46cf4185293b9cc8c88c95394",
|
||||
"rev": "cb5e3fdca1de58ccbc3ef53de65bd372b48f567c",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -74,10 +92,25 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1746576598,
|
||||
"lastModified": 1771008912,
|
||||
"owner": "nixos",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "b3582c75c7f21ce0b429898980eddbbf05c68e55",
|
||||
"rev": "a82ccc39b39b621151d6732718e3e250109076fa",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nixos",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs_2": {
|
||||
"locked": {
|
||||
"lastModified": 1770843696,
|
||||
"owner": "nixos",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "2343bbb58f99267223bc2aac4fc9ea301a155a16",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -90,11 +123,48 @@
|
||||
"root": {
|
||||
"inputs": {
|
||||
"devenv": "devenv",
|
||||
"fenix": "fenix",
|
||||
"git-hooks": "git-hooks",
|
||||
"nixpkgs": "nixpkgs",
|
||||
"nixpkgs": "nixpkgs_2",
|
||||
"pre-commit-hooks": [
|
||||
"git-hooks"
|
||||
],
|
||||
"rust-overlay": "rust-overlay"
|
||||
}
|
||||
},
|
||||
"rust-analyzer-src": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1771007332,
|
||||
"owner": "rust-lang",
|
||||
"repo": "rust-analyzer",
|
||||
"rev": "bbc84d335fbbd9b3099d3e40c7469ee57dbd1873",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "rust-lang",
|
||||
"ref": "nightly",
|
||||
"repo": "rust-analyzer",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"rust-overlay": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1771038269,
|
||||
"owner": "oxalica",
|
||||
"repo": "rust-overlay",
|
||||
"rev": "d7a86c8a4df49002446737603a3e0d7ef91a9637",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "oxalica",
|
||||
"repo": "rust-overlay",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
devenv.nix | 29
@@ -10,16 +10,45 @@
  packages = [
    pkgs.openssl
    pkgs.nodejs
    pkgs.watchman
    pkgs.vscode-langservers-extracted
    pkgs.cargo-dist
    pkgs.cargo-xwin
    pkgs.clang
    pkgs.onnxruntime
    pkgs.cargo-watch
    pkgs.tailwindcss_4
  ];

  languages.rust = {
    enable = true;
    components = ["rustc" "clippy" "rustfmt" "cargo" "rust-analyzer"];
    channel = "nightly";
    targets = ["x86_64-unknown-linux-gnu" "x86_64-pc-windows-msvc"];
    mold.enable = true;
  };

  env = {
    ORT_DYLIB_PATH = "${pkgs.onnxruntime}/lib/libonnxruntime.so";
    S3_ENDPOINT = "http://127.0.0.1:19000";
    S3_BUCKET = "minne-tests";
    MINNE_TEST_S3_ENDPOINT = "http://127.0.0.1:19000";
    MINNE_TEST_S3_BUCKET = "minne-tests";
  };

  services.minio = {
    enable = true;
    listenAddress = "127.0.0.1:19000";
    consoleAddress = "127.0.0.1:19001";
    buckets = ["minne-tests"];
    accessKey = "minioadmin";
    secretKey = "minioadmin";
    region = "us-east-1";
  };

  processes = {
    surreal_db.exec = "docker run --rm --pull always -p 8000:8000 --net=host --user $(id -u) -v $(pwd)/database:/database surrealdb/surrealdb:latest-dev start rocksdb:/database/database.db --user root_user --pass root_password";
    server.exec = "cargo watch -x 'run --bin main'";
    tailwind.exec = "tailwindcss --cwd html-router -i app.css -o assets/style.css --watch=always";
  };
}

devenv.yaml | 18
@@ -1,15 +1,11 @@
# yaml-language-server: $schema=https://devenv.sh/devenv.schema.json
inputs:
  fenix:
    url: github:nix-community/fenix
  nixpkgs:
    url: github:nixos/nixpkgs/nixpkgs-unstable

# If you're using non-OSS software, you can set allowUnfree to true.
  rust-overlay:
    url: github:oxalica/rust-overlay
    inputs:
      nixpkgs:
        follows: nixpkgs
allowUnfree: true

# If you're willing to use a package that's vulnerable
# permittedInsecurePackages:
#   - "openssl-1.1.1w"

# If you have more than one devenv you can merge them
#imports:
#  - ./backend

@@ -4,17 +4,21 @@ members = ["cargo:."]
# Config for 'dist'
[dist]
# The preferred dist version to use in CI (Cargo.toml SemVer syntax)
cargo-dist-version = "0.28.0"
cargo-dist-version = "0.30.0"
# CI backends to support
ci = "github"
# Extra static files to include in each App (path relative to this Cargo.toml's dir)
include = ["lib"]
# The installers to generate for each app
installers = []
# Target platforms to build apps for (Rust target-triple syntax)
targets = ["aarch64-apple-darwin", "x86_64-unknown-linux-gnu", "x86_64-pc-windows-msvc"]
targets = ["aarch64-apple-darwin", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu", "x86_64-pc-windows-msvc"]
# Skip checking whether the specified configuration files are up to date
allow-dirty = ["ci"]

[dist.github-custom-runners]
aarch64-apple-darwin = "macos-latest"
x86_64-apple-darwin = "macos-15-intel"
x86_64-unknown-linux-gnu = "ubuntu-22.04"
x86_64-unknown-linux-musl = "ubuntu-22.04"
x86_64-pc-windows-msvc = "windows-latest"

@@ -1,5 +1,3 @@
version: '3.8'

services:
  minne:
    build: .
@@ -12,10 +10,11 @@ services:
      SURREALDB_PASSWORD: "root_password"
      SURREALDB_DATABASE: "test"
      SURREALDB_NAMESPACE: "test"
      OPENAI_API_KEY: "sk-key"
      OPENAI_API_KEY: "sk-add-your-key"
      DATA_DIR: "./data"
      HTTP_PORT: 3000
      RUST_LOG: "info"
      RERANKING_ENABLED: false ## Change to true to enable reranking
    depends_on:
      - surrealdb
    networks:
@@ -31,7 +30,7 @@ services:
      - ./database:/database # Mounts a 'database' folder from your project directory
    command: >
      start
      --log debug
      --log info
      --user root_user
      --pass root_password
      rocksdb:./database/database.db

docs/architecture.md | 74 (new file)
@@ -0,0 +1,74 @@
# Architecture

## Tech Stack

| Layer | Technology |
|-------|------------|
| Backend | Rust with Axum (SSR) |
| Frontend | HTML + HTMX + minimal JS |
| Database | SurrealDB (graph, document, vector) |
| AI | OpenAI-compatible API |
| Web Processing | Headless Chromium |

## Crate Structure

```
minne/
├── main/                 # Combined server + worker binary
├── api-router/           # REST API routes
├── html-router/          # SSR web interface
├── ingestion-pipeline/   # Content processing pipeline
├── retrieval-pipeline/   # Search and retrieval logic
├── common/               # Shared types, storage, utilities
├── evaluations/          # Benchmarking framework
└── json-stream-parser/   # Streaming JSON utilities
```

## Process Modes

| Binary | Purpose |
|--------|---------|
| `main` | All-in-one: serves UI and processes content |
| `server` | UI and API only (no background processing) |
| `worker` | Background processing only (no UI) |

Split deployment is useful for scaling or resource isolation.

## Data Flow

```
Content In → Ingestion Pipeline → SurrealDB
                    ↓
            Entity Extraction
                    ↓
          Embedding Generation
                    ↓
          Graph Relationships

Query → Retrieval Pipeline → Results
                    ↓
          Vector Search + FTS
                    ↓
   RRF Fusion → (Optional Rerank) → Response
```

## Database Schema

SurrealDB stores:

- **TextContent** — Raw ingested content
- **TextChunk** — Chunked content with embeddings
- **KnowledgeEntity** — Extracted entities (people, concepts, etc.)
- **KnowledgeRelationship** — Connections between entities
- **User** — Authentication and preferences
- **SystemSettings** — Model configuration

Embeddings are stored in dedicated tables with HNSW indexes for fast vector search.

## Retrieval Strategy

1. **Collect candidates** — Vector similarity + full-text search
2. **Merge ranks** — Reciprocal Rank Fusion (RRF); a sketch follows below
3. **Attach context** — Link chunks to parent entities
4. **Rerank** (optional) — Cross-encoder reranking
5. **Return** — Top-k results with metadata
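
A minimal sketch of the fusion in step 2, assuming the vector and full-text searches each return a ranked list of candidate IDs. The function name and the common `k = 60.0` damping constant are illustrative choices, not Minne's actual implementation:

```rust
use std::collections::HashMap;

/// Merge several ranked candidate lists with Reciprocal Rank Fusion.
fn rrf_merge(ranked_lists: &[Vec<String>], k: f32) -> Vec<(String, f32)> {
    let mut scores: HashMap<String, f32> = HashMap::new();
    for list in ranked_lists {
        for (rank, id) in list.iter().enumerate() {
            // Ranks are 1-based in the RRF formula: score += 1 / (k + rank).
            *scores.entry(id.clone()).or_insert(0.0) += 1.0 / (k + rank as f32 + 1.0);
        }
    }
    let mut merged: Vec<(String, f32)> = scores.into_iter().collect();
    merged.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
    merged
}
```

With `k` around 60, a document ranked highly by either search floats to the top even when the other search ranks it lower.
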
docs/configuration.md | 117 (new file)
@@ -0,0 +1,117 @@
# Configuration

Minne can be configured via environment variables or a `config.yaml` file. Environment variables take precedence.
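
As a rough illustration of that precedence, a setting lookup can consult the environment first and only then fall back to the parsed YAML value. This is a sketch of the idea, not Minne's actual loader; the helper name and default are invented for the example:

```rust
use std::env;

/// Resolve one setting: environment variable wins, then the value read from
/// config.yaml (parsed elsewhere), then a hard-coded default.
fn resolve_setting(env_key: &str, yaml_value: Option<String>, default: &str) -> String {
    env::var(env_key)
        .ok()
        .or(yaml_value)
        .unwrap_or_else(|| default.to_string())
}

// Example: HTTP_PORT from the environment overrides `http_port` from config.yaml.
// let port = resolve_setting("HTTP_PORT", Some("3000".into()), "3000");
```
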
## Required Settings

| Variable | Description | Example |
|----------|-------------|---------|
| `OPENAI_API_KEY` | API key for OpenAI-compatible endpoint | `sk-...` |
| `SURREALDB_ADDRESS` | WebSocket address of SurrealDB | `ws://127.0.0.1:8000` |
| `SURREALDB_USERNAME` | SurrealDB username | `root_user` |
| `SURREALDB_PASSWORD` | SurrealDB password | `root_password` |
| `SURREALDB_DATABASE` | Database name | `minne_db` |
| `SURREALDB_NAMESPACE` | Namespace | `minne_ns` |

## Optional Settings

| Variable | Description | Default |
|----------|-------------|---------|
| `HTTP_PORT` | Server port | `3000` |
| `DATA_DIR` | Local data directory | `./data` |
| `OPENAI_BASE_URL` | Custom AI provider URL | OpenAI default |
| `RUST_LOG` | Logging level | `info` |
| `STORAGE` | Storage backend (`local`, `memory`, `s3`) | `local` |
| `PDF_INGEST_MODE` | PDF ingestion strategy (`classic`, `llm-first`) | `llm-first` |
| `RETRIEVAL_STRATEGY` | Default retrieval strategy | - |
| `EMBEDDING_BACKEND` | Embedding provider (`openai`, `fastembed`) | `fastembed` |
| `FASTEMBED_CACHE_DIR` | Model cache directory | `<data_dir>/fastembed` |
| `FASTEMBED_SHOW_DOWNLOAD_PROGRESS` | Show progress bar for model downloads | `false` |
| `FASTEMBED_MAX_LENGTH` | Max sequence length for FastEmbed models | - |
| `INGEST_MAX_BODY_BYTES` | Max request body size for ingest endpoints | `20000000` |
| `INGEST_MAX_FILES` | Max files allowed per ingest request | `5` |
| `INGEST_MAX_CONTENT_BYTES` | Max `content` field size for ingest requests | `262144` |
| `INGEST_MAX_CONTEXT_BYTES` | Max `context` field size for ingest requests | `16384` |
| `INGEST_MAX_CATEGORY_BYTES` | Max `category` field size for ingest requests | `128` |

### S3 Storage (Optional)

Used when `STORAGE` is set to `s3`.

| Variable | Description | Default |
|----------|-------------|---------|
| `S3_BUCKET` | S3 bucket name | - |
| `S3_ENDPOINT` | Custom endpoint (e.g. MinIO) | AWS default |
| `S3_REGION` | AWS Region | `us-east-1` |
| `AWS_ACCESS_KEY_ID` | Access key | - |
| `AWS_SECRET_ACCESS_KEY` | Secret key | - |

### Reranking (Optional)

| Variable | Description | Default |
|----------|-------------|---------|
| `RERANKING_ENABLED` | Enable FastEmbed reranking | `false` |
| `RERANKING_POOL_SIZE` | Concurrent reranker workers | - |

> [!NOTE]
> Enabling reranking downloads ~1.1 GB of model data on first startup.

## Example config.yaml

```yaml
surrealdb_address: "ws://127.0.0.1:8000"
surrealdb_username: "root_user"
surrealdb_password: "root_password"
surrealdb_database: "minne_db"
surrealdb_namespace: "minne_ns"
openai_api_key: "sk-your-key-here"
data_dir: "./minne_data"
http_port: 3000

# New settings
storage: "local"
# storage: "s3"
# s3_bucket: "my-bucket"
# s3_endpoint: "http://localhost:9000" # Optional, for MinIO etc.
# s3_region: "us-east-1"
pdf_ingest_mode: "llm-first"
embedding_backend: "fastembed"

# Optional reranking
reranking_enabled: true
reranking_pool_size: 2

# Ingest safety limits
ingest_max_body_bytes: 20000000
ingest_max_files: 5
ingest_max_content_bytes: 262144
ingest_max_context_bytes: 16384
ingest_max_category_bytes: 128
```

## AI Provider Setup

Minne works with any OpenAI-compatible API that supports structured outputs.

### OpenAI (Default)

Set `OPENAI_API_KEY` only. The default base URL points to OpenAI.

### Ollama

```bash
OPENAI_API_KEY="ollama"
OPENAI_BASE_URL="http://localhost:11434/v1"
```

### Other Providers

Any provider exposing an OpenAI-compatible endpoint works. Set `OPENAI_BASE_URL` accordingly.

## Model Selection

1. Access `/admin` in your Minne instance
2. Select models for content processing and chat
3. **Content Processing**: Must support structured outputs
4. **Embedding Dimensions**: Update when changing embedding models (e.g., 1536 for `text-embedding-3-small`)
docs/features.md | 64 (new file)
@@ -0,0 +1,64 @@
# Features

## Search vs Chat

**Search** — Use when you know what you're looking for. Full-text search matches query terms across your content.

**Chat** — Use when exploring concepts or reasoning about your knowledge. The AI analyzes your query and retrieves relevant context from your entire knowledge base.

## Content Processing

Minne automatically processes saved content:

1. **Web scraping** extracts readable text from URLs (via headless Chrome)
2. **Text analysis** identifies key concepts and relationships
3. **Graph creation** builds connections between related content
4. **Embedding generation** enables semantic search

## Knowledge Graph

Explore your knowledge as an interactive network:

- **Manual curation** — Create entities and relationships yourself
- **AI automation** — Let AI extract entities and discover relationships
- **Hybrid approach** — AI suggests connections for your approval

The D3-based graph visualization shows entities as nodes and relationships as edges.

## Hybrid Retrieval

Minne combines multiple retrieval strategies:

- **Vector similarity** — Semantic matching via embeddings
- **Full-text search** — Keyword matching with BM25
- **Graph traversal** — Following relationships between entities

Results are merged using Reciprocal Rank Fusion (RRF) for optimal relevance.
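
A rough sketch of how candidates from the three strategies can be combined before rank fusion, deduplicating by ID and keeping the best score seen. The types and function below are illustrative only, not taken from the codebase:

```rust
use std::collections::HashMap;

struct Candidate {
    id: String,
    score: f32,
}

/// Merge candidate lists from several strategies, keeping one entry per id.
fn dedup_candidates(lists: Vec<Vec<Candidate>>) -> Vec<Candidate> {
    let mut best: HashMap<String, Candidate> = HashMap::new();
    for candidate in lists.into_iter().flatten() {
        let keep = match best.get(&candidate.id) {
            Some(existing) => candidate.score > existing.score,
            None => true,
        };
        if keep {
            best.insert(candidate.id.clone(), candidate);
        }
    }
    best.into_values().collect()
}
```
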
## Reranking (Optional)

When enabled, retrieval results are rescored with a cross-encoder model for improved relevance. Powered by [fastembed-rs](https://github.com/Anush008/fastembed-rs).

**Trade-offs:**
- Downloads ~1.1 GB of model data
- Adds latency per query
- Potentially improves answer quality; see the [blog post](https://blog.stark.pub/posts/eval-retrieval-refactor/)

Enable via `RERANKING_ENABLED=true`. See [Configuration](./configuration.md).

## Multi-Format Ingestion

Supported content types:
- Plain text and notes
- URLs (web pages)
- PDF documents
- Audio files
- Images

## Scratchpad

Quickly capture content without committing to permanent storage. Convert to full content when ready.

## iOS Shortcut

Use the [Minne iOS Shortcut](https://www.icloud.com/shortcuts/e433fbd7602f4e2eaa70dca162323477) for quick content capture from your phone.
docs/installation.md | 67 (new file)
@@ -0,0 +1,67 @@
# Installation

Minne can be installed through several methods. Choose the one that best fits your setup.

## Docker Compose (Recommended)

The fastest way to get Minne running with all dependencies:

```bash
git clone https://github.com/perstarkse/minne.git
cd minne
docker compose up -d
```

The included `docker-compose.yml` handles SurrealDB and Chromium automatically.

**Required:** Set your `OPENAI_API_KEY` in `docker-compose.yml` before starting.

## Nix

Run Minne directly with Nix (includes Chromium):

```bash
nix run 'github:perstarkse/minne#main'
```

Configure via environment variables or a `config.yaml` file. See [Configuration](./configuration.md).

## Pre-built Binaries

Download binaries for Windows, macOS, and Linux from [GitHub Releases](https://github.com/perstarkse/minne/releases/latest).

**Requirements:**
- SurrealDB instance (local or remote)
- Chromium (for web scraping)

## Build from Source

```bash
git clone https://github.com/perstarkse/minne.git
cd minne
cargo build --release --bin main
```

The binary will be at `target/release/main`.

**Requirements:**
- Rust toolchain
- SurrealDB accessible at configured address
- Chromium in PATH

## Process Modes

Minne offers flexible deployment:

| Binary | Description |
|--------|-------------|
| `main` | Combined server + worker (recommended) |
| `server` | Web interface and API only |
| `worker` | Background processing only |

For most users, `main` is the right choice. Split deployments are useful for resource optimization or scaling.

## Next Steps

- [Configuration](./configuration.md) — Environment variables and config.yaml
- [Features](./features.md) — What Minne can do
docs/vision.md | 48 (new file)
@@ -0,0 +1,48 @@
# Vision

## The "Why" Behind Minne

Personal knowledge management has always fascinated me. I wanted something that made it incredibly easy to capture content—snippets of text, URLs, media—while automatically discovering connections between ideas. But I also wanted control over my knowledge structure.

Traditional tools like Logseq and Obsidian are excellent, but manual linking often becomes a hindrance. Fully automated systems sometimes miss important context or create relationships I wouldn't have chosen.

Minne offers the best of both worlds: effortless capture with AI-assisted relationship discovery, but with flexibility to manually curate, edit, or override connections. Let AI handle the heavy lifting, take full control yourself, or use a hybrid approach where AI suggests and you approve.

## Design Principles

- **Capture should be instant** — No friction between thought and storage
- **Connections should emerge** — AI finds relationships you might miss
- **Control should be optional** — Automate by default, curate when it matters
- **Privacy should be default** — Self-hosted, your data stays yours

## Roadmap

### Near-term

- [ ] TUI frontend with system editor integration
- [ ] Enhanced retrieval recall via improved reranking
- [ ] Additional content type support (e-books, research papers)

### Medium-term

- [ ] Embedded SurrealDB option (zero-config `nix run` with just `OPENAI_API_KEY`)
- [ ] Browser extension for seamless capture
- [ ] Mobile-native apps

### Long-term

- [ ] Federated knowledge sharing (opt-in)
- [ ] Local LLM integration (fully offline operation)
- [ ] Plugin system for custom entity extractors

## Related Projects

If Minne isn't quite right for you, check out:

- [Karakeep](https://github.com/karakeep-app/karakeep) (formerly Hoarder) — Excellent bookmark/read-later with AI tagging
- [Logseq](https://logseq.com/) — Outliner-based PKM with manual linking
- [Obsidian](https://obsidian.md/) — Markdown-based PKM with plugin ecosystem

## Contributing

Feature requests and contributions are welcome. Minne was built for personal use first, but the self-hosted community benefits when we share.
evaluations/Cargo.toml | 36 (new file)
@@ -0,0 +1,36 @@
[package]
name = "evaluations"
version = "0.1.0"
edition = "2021"

[dependencies]
anyhow = { workspace = true }
async-openai = { workspace = true }
chrono = { workspace = true }
common = { path = "../common" }
retrieval-pipeline = { path = "../retrieval-pipeline" }
ingestion-pipeline = { path = "../ingestion-pipeline" }
futures = { workspace = true }
fastembed = { workspace = true }
serde = { workspace = true, features = ["derive"] }
tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
uuid = { workspace = true }
text-splitter = { workspace = true }
unicode-normalization = { workspace = true }
rand = "0.8"
sha2 = { workspace = true }
object_store = { workspace = true }
surrealdb = { workspace = true }
serde_json = { workspace = true }
async-trait = { workspace = true }
once_cell = "1.19"
serde_yaml = "0.9"
criterion = "0.5"
state-machines = { workspace = true }
clap = { version = "4.4", features = ["derive", "env"] }

[dev-dependencies]
tempfile = { workspace = true }
common = { path = "../common", features = ["test-utils"] }
evaluations/README.md | 212 (new file)
@@ -0,0 +1,212 @@
# Evaluations

The `evaluations` crate provides a retrieval evaluation framework for benchmarking Minne's information retrieval pipeline against standard datasets.

## Quick Start

```bash
# Run SQuAD v2.0 evaluation (vector-only, recommended)
cargo run --package evaluations -- --ingest-chunks-only

# Run a specific dataset
cargo run --package evaluations -- --dataset fiqa --ingest-chunks-only

# Convert dataset only (no evaluation)
cargo run --package evaluations -- --convert-only
```

## Prerequisites

### 1. SurrealDB

Start a SurrealDB instance before running evaluations:

```bash
docker-compose up -d surrealdb
```

Or using the default endpoint configuration:

```bash
surreal start --user root_user --pass root_password
```

### 2. Download Raw Datasets

Raw datasets must be downloaded manually and placed in `evaluations/data/raw/`. See [Dataset Sources](#dataset-sources) below for links and formats.

## Directory Structure

```
evaluations/
├── data/
│   ├── raw/          # Downloaded raw datasets (manual)
│   │   ├── squad/    # SQuAD v2.0
│   │   ├── nq-dev/   # Natural Questions
│   │   ├── fiqa/     # BEIR: FiQA-2018
│   │   ├── fever/    # BEIR: FEVER
│   │   ├── hotpotqa/ # BEIR: HotpotQA
│   │   └── ...       # Other BEIR subsets
│   └── converted/    # Auto-generated (Minne JSON format)
├── cache/            # Ingestion and embedding caches
├── reports/          # Evaluation output (JSON + Markdown)
├── manifest.yaml     # Dataset and slice definitions
└── src/              # Evaluation source code
```

## Dataset Sources

### SQuAD v2.0

Download and place at `data/raw/squad/dev-v2.0.json`:

```bash
mkdir -p evaluations/data/raw/squad
curl -L https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json \
  -o evaluations/data/raw/squad/dev-v2.0.json
```

### Natural Questions (NQ)

Download and place at `data/raw/nq-dev/dev-all.jsonl`:

```bash
mkdir -p evaluations/data/raw/nq-dev
# Download from Google's Natural Questions page or HuggingFace
# File: dev-all.jsonl (simplified JSONL format)
```

Source: [Google Natural Questions](https://ai.google.com/research/NaturalQuestions)

### BEIR Datasets

All BEIR datasets follow the same format structure:

```
data/raw/<dataset>/
├── corpus.jsonl  # Document corpus
├── queries.jsonl # Query set
└── qrels/
    └── test.tsv  # Relevance judgments (or dev.tsv)
```

Download datasets from the [BEIR Benchmark repository](https://github.com/beir-cellar/beir). Each dataset zip extracts to the required directory structure.

| Dataset | Directory |
|------------|---------------|
| FEVER | `fever/` |
| FiQA-2018 | `fiqa/` |
| HotpotQA | `hotpotqa/` |
| NFCorpus | `nfcorpus/` |
| Quora | `quora/` |
| TREC-COVID | `trec-covid/` |
| SciFact | `scifact/` |
| NQ (BEIR) | `nq/` |

Example download:

```bash
cd evaluations/data/raw
curl -L https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip -o fiqa.zip
unzip fiqa.zip && rm fiqa.zip
```

## Dataset Conversion

Raw datasets are automatically converted to Minne's internal JSON format on first run. To force reconversion:

```bash
cargo run --package evaluations -- --force-convert
```

Converted files are saved to `data/converted/` and cached for subsequent runs.

## CLI Reference

### Common Options

| Flag | Description | Default |
|------|-------------|---------|
| `--dataset <NAME>` | Dataset to evaluate | `squad-v2` |
| `--limit <N>` | Max questions to evaluate (0 = all) | `200` |
| `--k <N>` | Precision@k cutoff | `5` |
| `--slice <ID>` | Use a predefined slice from manifest | — |
| `--rerank` | Enable FastEmbed reranking stage | disabled |
| `--embedding-backend <BE>` | `fastembed` or `hashed` | `fastembed` |
| `--ingest-chunks-only` | Skip entity extraction, ingest only text chunks | disabled |

> [!TIP]
> Use `--ingest-chunks-only` when evaluating vector-only retrieval strategies. This skips the LLM-based entity extraction and graph generation, significantly speeding up ingestion while focusing on pure chunk-based vector search.

### Available Datasets

```
squad-v2, natural-questions, beir, fever, fiqa, hotpotqa,
nfcorpus, quora, trec-covid, scifact, nq-beir
```

### Database Configuration

| Flag | Environment | Default |
|------|-------------|---------|
| `--db-endpoint` | `EVAL_DB_ENDPOINT` | `ws://127.0.0.1:8000` |
| `--db-username` | `EVAL_DB_USERNAME` | `root_user` |
| `--db-password` | `EVAL_DB_PASSWORD` | `root_password` |
| `--db-namespace` | `EVAL_DB_NAMESPACE` | auto-generated |
| `--db-database` | `EVAL_DB_DATABASE` | auto-generated |

### Example Runs

```bash
# Vector-only evaluation (recommended for benchmarking)
cargo run --package evaluations -- \
  --dataset fiqa \
  --ingest-chunks-only \
  --limit 200

# Full FiQA evaluation with reranking
cargo run --package evaluations -- \
  --dataset fiqa \
  --ingest-chunks-only \
  --limit 500 \
  --rerank \
  --k 10

# Use a predefined slice for reproducibility
cargo run --package evaluations -- --slice fiqa-test-200 --ingest-chunks-only

# Run the mixed BEIR benchmark
cargo run --package evaluations -- --dataset beir --slice beir-mix-600 --ingest-chunks-only
```

## Slices

Slices are predefined, reproducible subsets defined in `manifest.yaml`. Each slice specifies:

- **limit**: Number of questions
- **corpus_limit**: Maximum corpus size
- **seed**: Fixed RNG seed for reproducibility

View available slices in [manifest.yaml](./manifest.yaml).

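The fixed seed is what makes a slice reproducible: shuffling the candidate pool with a seeded RNG and truncating to `limit` always yields the same subset. A minimal sketch of that idea using the `rand` crate; the helper is illustrative and not the evaluation code itself:

```rust
use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng};

/// Deterministically pick `limit` question ids using a fixed seed.
fn sample_slice(mut ids: Vec<String>, limit: usize, seed: u64) -> Vec<String> {
    let mut rng = StdRng::seed_from_u64(seed);
    ids.shuffle(&mut rng);
    ids.truncate(limit);
    ids
}
```
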
## Reports

Evaluations generate reports in `reports/`:

- **JSON**: Full structured results (`*-report.json`)
- **Markdown**: Human-readable summary with sample mismatches (`*-report.md`)
- **History**: Timestamped run history (`history/`)

## Performance Tuning

```bash
# Log per-stage performance timings
cargo run --package evaluations -- --perf-log-console

# Save telemetry to file
cargo run --package evaluations -- --perf-log-json ./perf.json
```

## License

See [../LICENSE](../LICENSE).
evaluations/manifest.yaml | 168 (new file)
@@ -0,0 +1,168 @@
|
||||
default_dataset: squad-v2
datasets:
  - id: squad-v2
    label: "SQuAD v2.0"
    category: "SQuAD v2.0"
    entity_suffix: "SQuAD"
    source_prefix: "squad"
    raw: "data/raw/squad/dev-v2.0.json"
    converted: "data/converted/squad-minne.json"
    include_unanswerable: false
    slices:
      - id: squad-dev-200
        label: "SQuAD dev (200)"
        description: "Deterministic 200-case slice for local eval"
        limit: 200
        corpus_limit: 2000
        seed: 0x5eed2025
  - id: natural-questions-dev
    label: "Natural Questions (dev)"
    category: "Natural Questions"
    entity_suffix: "Natural Questions"
    source_prefix: "nq"
    raw: "data/raw/nq-dev/dev-all.jsonl"
    converted: "data/converted/nq-dev-minne.json"
    include_unanswerable: true
    slices:
      - id: nq-dev-200
        label: "NQ dev (200)"
        description: "200-case slice of the dev set"
        limit: 200
        corpus_limit: 2000
        include_unanswerable: false
        seed: 0x5eed2025
  - id: beir
    label: "BEIR mix"
    category: "BEIR"
    entity_suffix: "BEIR"
    source_prefix: "beir"
    raw: "data/raw/beir"
    converted: "data/converted/beir-minne.json"
    include_unanswerable: false
    slices:
      - id: beir-mix-600
        label: "BEIR mix (600)"
        description: "Balanced slice across FEVER, FiQA, HotpotQA, NFCorpus, Quora, TREC-COVID, SciFact, NQ-BEIR"
        limit: 600
        corpus_limit: 6000
        seed: 0x5eed2025
  - id: fever
    label: "FEVER (BEIR)"
    category: "FEVER"
    entity_suffix: "FEVER"
    source_prefix: "fever"
    raw: "data/raw/fever"
    converted: "data/converted/fever-minne.json"
    include_unanswerable: false
    slices:
      - id: fever-test-200
        label: "FEVER test (200)"
        description: "200-case slice from BEIR test qrels"
        limit: 200
        corpus_limit: 5000
        seed: 0x5eed2025
  - id: fiqa
    label: "FiQA-2018 (BEIR)"
    category: "FiQA-2018"
    entity_suffix: "FiQA"
    source_prefix: "fiqa"
    raw: "data/raw/fiqa"
    converted: "data/converted/fiqa-minne.json"
    include_unanswerable: false
    slices:
      - id: fiqa-test-200
        label: "FiQA test (200)"
        description: "200-case slice from BEIR test qrels"
        limit: 200
        corpus_limit: 5000
        seed: 0x5eed2025
  - id: hotpotqa
    label: "HotpotQA (BEIR)"
    category: "HotpotQA"
    entity_suffix: "HotpotQA"
    source_prefix: "hotpotqa"
    raw: "data/raw/hotpotqa"
    converted: "data/converted/hotpotqa-minne.json"
    include_unanswerable: false
    slices:
      - id: hotpotqa-test-200
        label: "HotpotQA test (200)"
        description: "200-case slice from BEIR test qrels"
        limit: 200
        corpus_limit: 5000
        seed: 0x5eed2025
  - id: nfcorpus
    label: "NFCorpus (BEIR)"
    category: "NFCorpus"
    entity_suffix: "NFCorpus"
    source_prefix: "nfcorpus"
    raw: "data/raw/nfcorpus"
    converted: "data/converted/nfcorpus-minne.json"
    include_unanswerable: false
    slices:
      - id: nfcorpus-test-200
        label: "NFCorpus test (200)"
        description: "200-case slice from BEIR test qrels"
        limit: 200
        corpus_limit: 5000
        seed: 0x5eed2025
  - id: quora
    label: "Quora (IR)"
    category: "Quora"
    entity_suffix: "Quora"
    source_prefix: "quora"
    raw: "data/raw/quora"
    converted: "data/converted/quora-minne.json"
    include_unanswerable: false
    slices:
      - id: quora-test-200
        label: "Quora test (200)"
        description: "200-case slice from BEIR test qrels"
        limit: 200
        corpus_limit: 5000
        seed: 0x5eed2025
  - id: trec-covid
    label: "TREC-COVID (BEIR)"
    category: "TREC-COVID"
    entity_suffix: "TREC-COVID"
    source_prefix: "trec-covid"
    raw: "data/raw/trec-covid"
    converted: "data/converted/trec-covid-minne.json"
    include_unanswerable: false
    slices:
      - id: trec-covid-test-200
        label: "TREC-COVID test (200)"
        description: "200-case slice from BEIR test qrels"
        limit: 200
        corpus_limit: 5000
        seed: 0x5eed2025
  - id: scifact
    label: "SciFact (BEIR)"
    category: "SciFact"
    entity_suffix: "SciFact"
    source_prefix: "scifact"
    raw: "data/raw/scifact"
    converted: "data/converted/scifact-minne.json"
    include_unanswerable: false
    slices:
      - id: scifact-test-200
        label: "SciFact test (200)"
        description: "200-case slice from BEIR test qrels"
        limit: 200
        corpus_limit: 3000
        seed: 0x5eed2025
  - id: nq-beir
    label: "Natural Questions (BEIR)"
    category: "Natural Questions"
    entity_suffix: "Natural Questions"
    source_prefix: "nq-beir"
    raw: "data/raw/nq"
    converted: "data/converted/nq-beir-minne.json"
    include_unanswerable: false
    slices:
      - id: nq-beir-test-200
        label: "NQ (BEIR) test (200)"
        description: "200-case slice from BEIR test qrels"
        limit: 200
        corpus_limit: 5000
        seed: 0x5eed2025
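
The manifest above drives dataset and slice selection in the evaluations crate. Purely as an illustration of its shape, here is a hypothetical deserialization with `serde`; the struct names and the use of `serde_yaml` are assumptions, not the crate's actual loader:

```rust
// Hypothetical sketch, not the evaluations crate's real types.
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct Manifest {
    default_dataset: String,
    datasets: Vec<DatasetSpec>,
}

#[derive(Debug, Deserialize)]
struct DatasetSpec {
    id: String,
    label: String,
    category: String,
    entity_suffix: String,
    source_prefix: String,
    raw: String,
    converted: String,
    include_unanswerable: bool,
    #[serde(default)]
    slices: Vec<SliceSpec>,
}

#[derive(Debug, Deserialize)]
struct SliceSpec {
    id: String,
    label: String,
    description: String,
    limit: usize,
    corpus_limit: usize,
    // Kept as a string to avoid depending on the YAML parser's handling of
    // hex literals like 0x5eed2025; the real loader may parse it into a u64.
    seed: String,
    #[serde(default)]
    include_unanswerable: Option<bool>,
}

fn load_manifest(path: &std::path::Path) -> anyhow::Result<Manifest> {
    let text = std::fs::read_to_string(path)?;
    Ok(serde_yaml::from_str(&text)?)
}
```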

506
evaluations/src/args.rs
Normal file
@@ -0,0 +1,506 @@
use std::{
|
||||
env,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use clap::{Args, Parser, ValueEnum};
|
||||
use retrieval_pipeline::RetrievalStrategy;
|
||||
|
||||
use crate::datasets::DatasetKind;
|
||||
|
||||
fn workspace_root() -> PathBuf {
|
||||
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
manifest_dir.parent().unwrap_or(&manifest_dir).to_path_buf()
|
||||
}
|
||||
|
||||
fn default_report_dir() -> PathBuf {
|
||||
workspace_root().join("evaluations/reports")
|
||||
}
|
||||
|
||||
fn default_cache_dir() -> PathBuf {
|
||||
workspace_root().join("evaluations/cache")
|
||||
}
|
||||
|
||||
fn default_ingestion_cache_dir() -> PathBuf {
|
||||
workspace_root().join("evaluations/cache/ingested")
|
||||
}
|
||||
|
||||
pub const DEFAULT_SLICE_SEED: u64 = 0x5eed_2025;
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Default)]
|
||||
#[value(rename_all = "lowercase")]
|
||||
pub enum EmbeddingBackend {
|
||||
Hashed,
|
||||
#[default]
|
||||
FastEmbed,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for EmbeddingBackend {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::Hashed => write!(f, "hashed"),
|
||||
Self::FastEmbed => write!(f, "fastembed"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Args)]
|
||||
pub struct RetrievalSettings {
|
||||
/// Override chunk vector candidate cap
|
||||
#[arg(long)]
|
||||
pub chunk_vector_take: Option<usize>,
|
||||
|
||||
/// Override chunk FTS candidate cap
|
||||
#[arg(long)]
|
||||
pub chunk_fts_take: Option<usize>,
|
||||
|
||||
/// Override average characters per token used for budgeting
|
||||
#[arg(long)]
|
||||
pub chunk_avg_chars_per_token: Option<usize>,
|
||||
|
||||
/// Override maximum chunks attached per entity
|
||||
#[arg(long)]
|
||||
pub max_chunks_per_entity: Option<usize>,
|
||||
|
||||
/// Enable the FastEmbed reranking stage
|
||||
#[arg(long = "rerank", action = clap::ArgAction::SetTrue, default_value_t = false)]
|
||||
pub rerank: bool,
|
||||
|
||||
/// Reranking engine pool size / parallelism
|
||||
#[arg(long, default_value_t = 4)]
|
||||
pub rerank_pool_size: usize,
|
||||
|
||||
/// Keep top-N entities after reranking
|
||||
#[arg(long, default_value_t = 10)]
|
||||
pub rerank_keep_top: usize,
|
||||
|
||||
/// Cap the number of chunks returned by retrieval (revised strategy)
|
||||
#[arg(long, default_value_t = 5)]
|
||||
pub chunk_result_cap: usize,
|
||||
|
||||
/// Reciprocal rank fusion k value for revised chunk merging
|
||||
#[arg(long)]
|
||||
pub chunk_rrf_k: Option<f32>,
|
||||
|
||||
/// Weight for vector ranks in revised RRF
|
||||
#[arg(long)]
|
||||
pub chunk_rrf_vector_weight: Option<f32>,
|
||||
|
||||
/// Weight for chunk FTS ranks in revised RRF
|
||||
#[arg(long)]
|
||||
pub chunk_rrf_fts_weight: Option<f32>,
|
||||
|
||||
/// Include vector ranks in revised RRF (default: true)
|
||||
#[arg(long)]
|
||||
pub chunk_rrf_use_vector: Option<bool>,
|
||||
|
||||
/// Include chunk FTS ranks in revised RRF (default: true)
|
||||
#[arg(long)]
|
||||
pub chunk_rrf_use_fts: Option<bool>,
|
||||
|
||||
/// Require verified chunks (disable with --llm-mode)
|
||||
#[arg(skip = true)]
|
||||
pub require_verified_chunks: bool,
|
||||
|
||||
/// Select the retrieval pipeline strategy
|
||||
#[arg(long, default_value_t = RetrievalStrategy::Default)]
|
||||
pub strategy: RetrievalStrategy,
|
||||
}
|
||||
|
||||
impl Default for RetrievalSettings {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
chunk_vector_take: None,
|
||||
chunk_fts_take: None,
|
||||
chunk_avg_chars_per_token: None,
|
||||
max_chunks_per_entity: None,
|
||||
rerank: false,
|
||||
rerank_pool_size: 4,
|
||||
rerank_keep_top: 10,
|
||||
chunk_result_cap: 5,
|
||||
chunk_rrf_k: None,
|
||||
chunk_rrf_vector_weight: None,
|
||||
chunk_rrf_fts_weight: None,
|
||||
chunk_rrf_use_vector: None,
|
||||
chunk_rrf_use_fts: None,
|
||||
require_verified_chunks: true,
|
||||
strategy: RetrievalStrategy::Default,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Args)]
|
||||
pub struct IngestConfig {
|
||||
/// Directory for ingestion corpora caches
|
||||
#[arg(long, default_value_os_t = default_ingestion_cache_dir())]
|
||||
pub ingestion_cache_dir: PathBuf,
|
||||
|
||||
/// Minimum tokens per chunk for ingestion
|
||||
#[arg(long, default_value_t = 256)]
|
||||
pub ingest_chunk_min_tokens: usize,
|
||||
|
||||
/// Maximum tokens per chunk for ingestion
|
||||
#[arg(long, default_value_t = 512)]
|
||||
pub ingest_chunk_max_tokens: usize,
|
||||
|
||||
/// Overlap between chunks during ingestion (tokens)
|
||||
#[arg(long, default_value_t = 50)]
|
||||
pub ingest_chunk_overlap_tokens: usize,
|
||||
|
||||
/// Run ingestion in chunk-only mode (skip analyzer/graph generation)
|
||||
#[arg(long)]
|
||||
pub ingest_chunks_only: bool,
|
||||
|
||||
/// Number of paragraphs to ingest concurrently
|
||||
#[arg(long, default_value_t = 10)]
|
||||
pub ingestion_batch_size: usize,
|
||||
|
||||
/// Maximum retries for ingestion failures per paragraph
|
||||
#[arg(long, default_value_t = 3)]
|
||||
pub ingestion_max_retries: usize,
|
||||
|
||||
/// Recompute embeddings for cached corpora without re-running ingestion
|
||||
#[arg(long, alias = "refresh-embeddings")]
|
||||
pub refresh_embeddings_only: bool,
|
||||
|
||||
/// Delete cached paragraph shards before rebuilding the ingestion corpus
|
||||
#[arg(long)]
|
||||
pub slice_reset_ingestion: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Args)]
|
||||
pub struct DatabaseArgs {
|
||||
/// SurrealDB server endpoint
|
||||
#[arg(long, default_value = "ws://127.0.0.1:8000", env = "EVAL_DB_ENDPOINT")]
|
||||
pub db_endpoint: String,
|
||||
|
||||
/// SurrealDB root username
|
||||
#[arg(long, default_value = "root_user", env = "EVAL_DB_USERNAME")]
|
||||
pub db_username: String,
|
||||
|
||||
/// SurrealDB root password
|
||||
#[arg(long, default_value = "root_password", env = "EVAL_DB_PASSWORD")]
|
||||
pub db_password: String,
|
||||
|
||||
/// Override the namespace used on the SurrealDB server
|
||||
#[arg(long, env = "EVAL_DB_NAMESPACE")]
|
||||
pub db_namespace: Option<String>,
|
||||
|
||||
/// Override the database used on the SurrealDB server
|
||||
#[arg(long, env = "EVAL_DB_DATABASE")]
|
||||
pub db_database: Option<String>,
|
||||
|
||||
/// Path to inspect DB state
|
||||
#[arg(long)]
|
||||
pub inspect_db_state: Option<PathBuf>,
|
||||
}
|
||||
|
||||
#[derive(Parser, Debug, Clone)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
pub struct Config {
|
||||
/// Convert the selected dataset and exit
|
||||
#[arg(long)]
|
||||
pub convert_only: bool,
|
||||
|
||||
/// Regenerate the converted dataset even if it already exists
|
||||
#[arg(long, alias = "refresh")]
|
||||
pub force_convert: bool,
|
||||
|
||||
/// Dataset to evaluate
|
||||
#[arg(long, default_value_t = DatasetKind::default())]
|
||||
pub dataset: DatasetKind,
|
||||
|
||||
/// Enable LLM-assisted evaluation features (includes unanswerable cases)
|
||||
#[arg(long)]
|
||||
pub llm_mode: bool,
|
||||
|
||||
/// Cap the slice corpus size (positives + negatives)
|
||||
#[arg(long)]
|
||||
pub corpus_limit: Option<usize>,
|
||||
|
||||
/// Path to the raw dataset (defaults per dataset)
|
||||
#[arg(long)]
|
||||
pub raw: Option<PathBuf>,
|
||||
|
||||
/// Path to write/read the converted dataset (defaults per dataset)
|
||||
#[arg(long)]
|
||||
pub converted: Option<PathBuf>,
|
||||
|
||||
/// Directory to write evaluation reports
|
||||
#[arg(long, default_value_os_t = default_report_dir())]
|
||||
pub report_dir: PathBuf,
|
||||
|
||||
/// Precision@k cutoff
|
||||
#[arg(long, default_value_t = 5)]
|
||||
pub k: usize,
|
||||
|
||||
/// Limit the number of questions evaluated (0 = all)
|
||||
#[arg(long = "limit", default_value_t = 200)]
|
||||
pub limit_arg: usize,
|
||||
|
||||
/// Number of mismatches to surface in the Markdown summary
|
||||
#[arg(long, default_value_t = 5)]
|
||||
pub sample: usize,
|
||||
|
||||
/// Disable context cropping when converting datasets (ingest entire documents)
|
||||
#[arg(long)]
|
||||
pub full_context: bool,
|
||||
|
||||
#[command(flatten)]
|
||||
pub retrieval: RetrievalSettings,
|
||||
|
||||
/// Concurrency level
|
||||
#[arg(long, default_value_t = 1)]
|
||||
pub concurrency: usize,
|
||||
|
||||
/// Embedding backend
|
||||
#[arg(long, default_value_t = EmbeddingBackend::FastEmbed)]
|
||||
pub embedding_backend: EmbeddingBackend,
|
||||
|
||||
/// FastEmbed model code
|
||||
#[arg(long)]
|
||||
pub embedding_model: Option<String>,
|
||||
|
||||
/// Directory for embedding caches
|
||||
#[arg(long, default_value_os_t = default_cache_dir())]
|
||||
pub cache_dir: PathBuf,
|
||||
|
||||
#[command(flatten)]
|
||||
pub ingest: IngestConfig,
|
||||
|
||||
/// Include entity descriptions and categories in JSON reports
|
||||
#[arg(long)]
|
||||
pub detailed_report: bool,
|
||||
|
||||
/// Use a cached dataset slice by id or path
|
||||
#[arg(long)]
|
||||
pub slice: Option<String>,
|
||||
|
||||
/// Ignore cached corpus state and rebuild the slice's SurrealDB corpus
|
||||
#[arg(long)]
|
||||
pub reseed_slice: bool,
|
||||
|
||||
/// Slice seed
|
||||
#[arg(skip = DEFAULT_SLICE_SEED)]
|
||||
pub slice_seed: u64,
|
||||
|
||||
/// Grow the slice ledger to contain at least this many answerable cases, then exit
|
||||
#[arg(long)]
|
||||
pub slice_grow: Option<usize>,
|
||||
|
||||
/// Evaluate questions starting at this offset within the slice
|
||||
#[arg(long, default_value_t = 0)]
|
||||
pub slice_offset: usize,
|
||||
|
||||
/// Target negative-to-positive paragraph ratio for slice growth
|
||||
#[arg(long, default_value_t = crate::slice::DEFAULT_NEGATIVE_MULTIPLIER)]
|
||||
pub negative_multiplier: f32,
|
||||
|
||||
/// Annotate the run; label is stored in JSON/Markdown reports
|
||||
#[arg(long)]
|
||||
pub label: Option<String>,
|
||||
|
||||
/// Write per-query chunk diagnostics JSONL to the provided path
|
||||
#[arg(long, alias = "chunk-diagnostics")]
|
||||
pub chunk_diagnostics_path: Option<PathBuf>,
|
||||
|
||||
/// Inspect an ingestion cache question and exit
|
||||
#[arg(long)]
|
||||
pub inspect_question: Option<String>,
|
||||
|
||||
/// Path to an ingestion cache manifest JSON for inspection mode
|
||||
#[arg(long)]
|
||||
pub inspect_manifest: Option<PathBuf>,
|
||||
|
||||
/// Override the SurrealDB system settings query model
|
||||
#[arg(long)]
|
||||
pub query_model: Option<String>,
|
||||
|
||||
/// Write structured performance telemetry JSON to the provided path
|
||||
#[arg(long)]
|
||||
pub perf_log_json: Option<PathBuf>,
|
||||
|
||||
/// Directory that receives timestamped perf JSON copies
|
||||
#[arg(long)]
|
||||
pub perf_log_dir: Option<PathBuf>,
|
||||
|
||||
/// Print per-stage performance timings to stdout after the run
|
||||
#[arg(long, alias = "perf-log")]
|
||||
pub perf_log_console: bool,
|
||||
|
||||
#[command(flatten)]
|
||||
pub database: DatabaseArgs,
|
||||
|
||||
// Computed fields (not arguments)
|
||||
#[arg(skip)]
|
||||
pub raw_dataset_path: PathBuf,
|
||||
#[arg(skip)]
|
||||
pub converted_dataset_path: PathBuf,
|
||||
#[arg(skip)]
|
||||
pub limit: Option<usize>,
|
||||
#[arg(skip)]
|
||||
pub summary_sample: usize,
|
||||
}
|
||||
|
||||
impl Config {
|
||||
pub fn context_token_limit(&self) -> Option<usize> {
|
||||
None
|
||||
}
|
||||
|
||||
pub fn finalize(&mut self) -> Result<()> {
|
||||
// Handle dataset paths
|
||||
if let Some(raw) = &self.raw {
|
||||
self.raw_dataset_path = raw.clone();
|
||||
} else {
|
||||
self.raw_dataset_path = self.dataset.default_raw_path();
|
||||
}
|
||||
|
||||
if let Some(converted) = &self.converted {
|
||||
self.converted_dataset_path = converted.clone();
|
||||
} else {
|
||||
self.converted_dataset_path = self.dataset.default_converted_path();
|
||||
}
|
||||
|
||||
// Handle limit
|
||||
if self.limit_arg == 0 {
|
||||
self.limit = None;
|
||||
} else {
|
||||
self.limit = Some(self.limit_arg);
|
||||
}
|
||||
|
||||
// Handle sample
|
||||
self.summary_sample = self.sample.max(1);
|
||||
|
||||
// Handle retrieval settings
|
||||
self.retrieval.require_verified_chunks = !self.llm_mode;
|
||||
|
||||
if self.dataset == DatasetKind::Beir {
|
||||
self.negative_multiplier = 9.0;
|
||||
}
|
||||
|
||||
// Validations
|
||||
if self.ingest.ingest_chunk_min_tokens == 0
|
||||
|| self.ingest.ingest_chunk_min_tokens >= self.ingest.ingest_chunk_max_tokens
|
||||
{
|
||||
return Err(anyhow!(
|
||||
"--ingest-chunk-min-tokens must be greater than zero and less than --ingest-chunk-max-tokens (got {} >= {})",
|
||||
self.ingest.ingest_chunk_min_tokens,
|
||||
self.ingest.ingest_chunk_max_tokens
|
||||
));
|
||||
}
|
||||
|
||||
if self.ingest.ingest_chunk_overlap_tokens >= self.ingest.ingest_chunk_min_tokens {
|
||||
return Err(anyhow!(
|
||||
"--ingest-chunk-overlap-tokens ({}) must be less than --ingest-chunk-min-tokens ({})",
|
||||
self.ingest.ingest_chunk_overlap_tokens,
|
||||
self.ingest.ingest_chunk_min_tokens
|
||||
));
|
||||
}
|
||||
|
||||
if self.retrieval.rerank && self.retrieval.rerank_pool_size == 0 {
|
||||
return Err(anyhow!(
|
||||
"--rerank-pool must be greater than zero when reranking is enabled"
|
||||
));
|
||||
}
|
||||
|
||||
if let Some(k) = self.retrieval.chunk_rrf_k {
|
||||
if k <= 0.0 || !k.is_finite() {
|
||||
return Err(anyhow!(
|
||||
"--chunk-rrf-k must be a positive, finite number (got {k})"
|
||||
));
|
||||
}
|
||||
}
|
||||
if let Some(weight) = self.retrieval.chunk_rrf_vector_weight {
|
||||
if weight < 0.0 || !weight.is_finite() {
|
||||
return Err(anyhow!(
|
||||
"--chunk-rrf-vector-weight must be a non-negative, finite number (got {weight})"
|
||||
));
|
||||
}
|
||||
}
|
||||
if let Some(weight) = self.retrieval.chunk_rrf_fts_weight {
|
||||
if weight < 0.0 || !weight.is_finite() {
|
||||
return Err(anyhow!(
|
||||
"--chunk-rrf-fts-weight must be a non-negative, finite number (got {weight})"
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if self.concurrency == 0 {
|
||||
return Err(anyhow!("--concurrency must be greater than zero"));
|
||||
}
|
||||
|
||||
if self.embedding_backend == EmbeddingBackend::Hashed && self.embedding_model.is_some() {
|
||||
return Err(anyhow!(
|
||||
"--embedding-model cannot be used with the 'hashed' embedding backend"
|
||||
));
|
||||
}
|
||||
|
||||
if let Some(query_model) = &self.query_model {
|
||||
if query_model.trim().is_empty() {
|
||||
return Err(anyhow!("--query-model requires a non-empty model name"));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(grow) = self.slice_grow {
|
||||
if grow == 0 {
|
||||
return Err(anyhow!("--slice-grow must be greater than zero"));
|
||||
}
|
||||
}
|
||||
|
||||
if self.negative_multiplier <= 0.0 || !self.negative_multiplier.is_finite() {
|
||||
return Err(anyhow!(
|
||||
"--negative-multiplier must be a positive finite number"
|
||||
));
|
||||
}
|
||||
|
||||
// Handle corpus limit logic
|
||||
if let Some(limit) = self.limit {
|
||||
if let Some(corpus_limit) = self.corpus_limit {
|
||||
if corpus_limit < limit {
|
||||
self.corpus_limit = Some(limit);
|
||||
}
|
||||
} else {
|
||||
let default_multiplier = 10usize;
|
||||
let mut computed = limit.saturating_mul(default_multiplier);
|
||||
if computed < limit {
|
||||
computed = limit;
|
||||
}
|
||||
let max_cap = 1_000usize;
|
||||
if computed > max_cap {
|
||||
computed = max_cap;
|
||||
}
|
||||
self.corpus_limit = Some(computed);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle perf log dir env var fallback
|
||||
if self.perf_log_dir.is_none() {
|
||||
if let Ok(dir) = env::var("EVAL_PERF_LOG_DIR") {
|
||||
if !dir.trim().is_empty() {
|
||||
self.perf_log_dir = Some(PathBuf::from(dir));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ParsedArgs {
|
||||
pub config: Config,
|
||||
}
|
||||
|
||||
pub fn parse() -> Result<ParsedArgs> {
|
||||
let mut config = Config::parse();
|
||||
config.finalize()?;
|
||||
Ok(ParsedArgs { config })
|
||||
}
|
||||
|
||||
pub fn ensure_parent(path: &Path) -> Result<()> {
|
||||
if let Some(parent) = path.parent() {
|
||||
std::fs::create_dir_all(parent)
|
||||
.with_context(|| format!("creating parent directory for {}", path.display()))?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
88
evaluations/src/cache.rs
Normal file
@@ -0,0 +1,88 @@
use std::{
|
||||
collections::HashMap,
|
||||
path::{Path, PathBuf},
|
||||
sync::{
|
||||
atomic::{AtomicBool, Ordering},
|
||||
Arc,
|
||||
},
|
||||
};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
#[derive(Debug, Default, Serialize, Deserialize)]
|
||||
struct EmbeddingCacheData {
|
||||
entities: HashMap<String, Vec<f32>>,
|
||||
chunks: HashMap<String, Vec<f32>>,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct EmbeddingCache {
|
||||
path: Arc<PathBuf>,
|
||||
data: Arc<Mutex<EmbeddingCacheData>>,
|
||||
dirty: Arc<AtomicBool>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl EmbeddingCache {
|
||||
pub async fn load(path: impl AsRef<Path>) -> Result<Self> {
|
||||
let path = path.as_ref().to_path_buf();
|
||||
let data = if path.exists() {
|
||||
let raw = tokio::fs::read(&path)
|
||||
.await
|
||||
.with_context(|| format!("reading embedding cache {}", path.display()))?;
|
||||
serde_json::from_slice(&raw)
|
||||
.with_context(|| format!("parsing embedding cache {}", path.display()))?
|
||||
} else {
|
||||
EmbeddingCacheData::default()
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
path: Arc::new(path),
|
||||
data: Arc::new(Mutex::new(data)),
|
||||
dirty: Arc::new(AtomicBool::new(false)),
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn get_entity(&self, id: &str) -> Option<Vec<f32>> {
|
||||
let guard = self.data.lock().await;
|
||||
guard.entities.get(id).cloned()
|
||||
}
|
||||
|
||||
pub async fn insert_entity(&self, id: String, embedding: Vec<f32>) {
|
||||
let mut guard = self.data.lock().await;
|
||||
guard.entities.insert(id, embedding);
|
||||
self.dirty.store(true, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub async fn get_chunk(&self, id: &str) -> Option<Vec<f32>> {
|
||||
let guard = self.data.lock().await;
|
||||
guard.chunks.get(id).cloned()
|
||||
}
|
||||
|
||||
pub async fn insert_chunk(&self, id: String, embedding: Vec<f32>) {
|
||||
let mut guard = self.data.lock().await;
|
||||
guard.chunks.insert(id, embedding);
|
||||
self.dirty.store(true, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub async fn persist(&self) -> Result<()> {
|
||||
if !self.dirty.load(Ordering::Relaxed) {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let guard = self.data.lock().await;
|
||||
let body = serde_json::to_vec_pretty(&*guard).context("serialising embedding cache")?;
|
||||
if let Some(parent) = self.path.parent() {
|
||||
tokio::fs::create_dir_all(parent)
|
||||
.await
|
||||
.with_context(|| format!("creating cache directory {}", parent.display()))?;
|
||||
}
|
||||
tokio::fs::write(&*self.path, body)
|
||||
.await
|
||||
.with_context(|| format!("writing embedding cache {}", self.path.display()))?;
|
||||
self.dirty.store(false, Ordering::Relaxed);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
187
evaluations/src/cases.rs
Normal file
@@ -0,0 +1,187 @@
//! Case generation from corpus manifests.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::corpus;
|
||||
|
||||
/// A test case for retrieval evaluation derived from a manifest question.
|
||||
pub(crate) struct SeededCase {
|
||||
pub question_id: String,
|
||||
pub question: String,
|
||||
pub expected_source: String,
|
||||
pub answers: Vec<String>,
|
||||
pub paragraph_id: String,
|
||||
pub paragraph_title: String,
|
||||
pub expected_chunk_ids: Vec<String>,
|
||||
pub is_impossible: bool,
|
||||
pub has_verified_chunks: bool,
|
||||
}
|
||||
|
||||
/// Convert a corpus manifest into seeded evaluation cases.
|
||||
pub(crate) fn cases_from_manifest(manifest: &corpus::CorpusManifest) -> Vec<SeededCase> {
|
||||
let mut title_map = HashMap::new();
|
||||
for paragraph in &manifest.paragraphs {
|
||||
title_map.insert(paragraph.paragraph_id.as_str(), paragraph.title.clone());
|
||||
}
|
||||
|
||||
let include_impossible = manifest.metadata.include_unanswerable;
|
||||
let require_verified_chunks = manifest.metadata.require_verified_chunks;
|
||||
|
||||
manifest
|
||||
.questions
|
||||
.iter()
|
||||
.filter(|question| {
|
||||
should_include_question(question, include_impossible, require_verified_chunks)
|
||||
})
|
||||
.map(|question| {
|
||||
let title = title_map
|
||||
.get(question.paragraph_id.as_str())
|
||||
.cloned()
|
||||
.unwrap_or_else(|| "Untitled".to_string());
|
||||
SeededCase {
|
||||
question_id: question.question_id.clone(),
|
||||
question: question.question_text.clone(),
|
||||
expected_source: question.text_content_id.clone(),
|
||||
answers: question.answers.clone(),
|
||||
paragraph_id: question.paragraph_id.clone(),
|
||||
paragraph_title: title,
|
||||
expected_chunk_ids: question.matching_chunk_ids.clone(),
|
||||
is_impossible: question.is_impossible,
|
||||
has_verified_chunks: !question.matching_chunk_ids.is_empty(),
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn should_include_question(
|
||||
question: &corpus::CorpusQuestion,
|
||||
include_impossible: bool,
|
||||
require_verified_chunks: bool,
|
||||
) -> bool {
|
||||
if !include_impossible && question.is_impossible {
|
||||
return false;
|
||||
}
|
||||
if require_verified_chunks && question.matching_chunk_ids.is_empty() {
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::corpus::store::{CorpusParagraph, EmbeddedKnowledgeEntity, EmbeddedTextChunk};
|
||||
use crate::corpus::{CorpusManifest, CorpusMetadata, CorpusQuestion, MANIFEST_VERSION};
|
||||
use chrono::Utc;
|
||||
use common::storage::types::text_content::TextContent;
|
||||
|
||||
fn sample_manifest() -> CorpusManifest {
|
||||
let paragraphs = vec![
|
||||
CorpusParagraph {
|
||||
paragraph_id: "p1".to_string(),
|
||||
title: "Alpha".to_string(),
|
||||
text_content: TextContent::new(
|
||||
"alpha context".to_string(),
|
||||
None,
|
||||
"test".to_string(),
|
||||
None,
|
||||
None,
|
||||
"user".to_string(),
|
||||
),
|
||||
entities: Vec::<EmbeddedKnowledgeEntity>::new(),
|
||||
relationships: Vec::new(),
|
||||
chunks: Vec::<EmbeddedTextChunk>::new(),
|
||||
},
|
||||
CorpusParagraph {
|
||||
paragraph_id: "p2".to_string(),
|
||||
title: "Beta".to_string(),
|
||||
text_content: TextContent::new(
|
||||
"beta context".to_string(),
|
||||
None,
|
||||
"test".to_string(),
|
||||
None,
|
||||
None,
|
||||
"user".to_string(),
|
||||
),
|
||||
entities: Vec::<EmbeddedKnowledgeEntity>::new(),
|
||||
relationships: Vec::new(),
|
||||
chunks: Vec::<EmbeddedTextChunk>::new(),
|
||||
},
|
||||
];
|
||||
let questions = vec![
|
||||
CorpusQuestion {
|
||||
question_id: "q1".to_string(),
|
||||
paragraph_id: "p1".to_string(),
|
||||
text_content_id: "tc-alpha".to_string(),
|
||||
question_text: "What is Alpha?".to_string(),
|
||||
answers: vec!["Alpha".to_string()],
|
||||
is_impossible: false,
|
||||
matching_chunk_ids: vec!["chunk-alpha".to_string()],
|
||||
},
|
||||
CorpusQuestion {
|
||||
question_id: "q2".to_string(),
|
||||
paragraph_id: "p1".to_string(),
|
||||
text_content_id: "tc-alpha".to_string(),
|
||||
question_text: "Unanswerable?".to_string(),
|
||||
answers: Vec::new(),
|
||||
is_impossible: true,
|
||||
matching_chunk_ids: Vec::new(),
|
||||
},
|
||||
CorpusQuestion {
|
||||
question_id: "q3".to_string(),
|
||||
paragraph_id: "p2".to_string(),
|
||||
text_content_id: "tc-beta".to_string(),
|
||||
question_text: "Where is Beta?".to_string(),
|
||||
answers: vec!["Beta".to_string()],
|
||||
is_impossible: false,
|
||||
matching_chunk_ids: Vec::new(),
|
||||
},
|
||||
];
|
||||
CorpusManifest {
|
||||
version: MANIFEST_VERSION,
|
||||
metadata: CorpusMetadata {
|
||||
dataset_id: "ds".to_string(),
|
||||
dataset_label: "Dataset".to_string(),
|
||||
slice_id: "slice".to_string(),
|
||||
include_unanswerable: true,
|
||||
require_verified_chunks: true,
|
||||
ingestion_fingerprint: "fp".to_string(),
|
||||
embedding_backend: "test".to_string(),
|
||||
embedding_model: None,
|
||||
embedding_dimension: 3,
|
||||
converted_checksum: "chk".to_string(),
|
||||
generated_at: Utc::now(),
|
||||
paragraph_count: paragraphs.len(),
|
||||
question_count: questions.len(),
|
||||
chunk_min_tokens: 1,
|
||||
chunk_max_tokens: 10,
|
||||
chunk_only: false,
|
||||
},
|
||||
paragraphs,
|
||||
questions,
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cases_respect_mode_filters() {
|
||||
let mut manifest = sample_manifest();
|
||||
manifest.metadata.include_unanswerable = false;
|
||||
manifest.metadata.require_verified_chunks = true;
|
||||
|
||||
let strict_cases = cases_from_manifest(&manifest);
|
||||
assert_eq!(strict_cases.len(), 1);
|
||||
assert_eq!(strict_cases[0].question_id, "q1");
|
||||
assert_eq!(strict_cases[0].paragraph_title, "Alpha");
|
||||
|
||||
let mut llm_manifest = manifest.clone();
|
||||
llm_manifest.metadata.include_unanswerable = true;
|
||||
llm_manifest.metadata.require_verified_chunks = false;
|
||||
|
||||
let llm_cases = cases_from_manifest(&llm_manifest);
|
||||
let ids: Vec<_> = llm_cases
|
||||
.iter()
|
||||
.map(|case| case.question_id.as_str())
|
||||
.collect();
|
||||
assert_eq!(ids, vec!["q1", "q2", "q3"]);
|
||||
}
|
||||
}
|
||||
42
evaluations/src/corpus/config.rs
Normal file
@@ -0,0 +1,42 @@
use std::path::PathBuf;

use crate::args::Config;

#[derive(Debug, Clone)]
pub struct CorpusCacheConfig {
    pub ingestion_cache_dir: PathBuf,
    pub force_refresh: bool,
    pub refresh_embeddings_only: bool,
    pub ingestion_batch_size: usize,
    pub ingestion_max_retries: usize,
}

impl CorpusCacheConfig {
    pub fn new(
        ingestion_cache_dir: impl Into<PathBuf>,
        force_refresh: bool,
        refresh_embeddings_only: bool,
        ingestion_batch_size: usize,
        ingestion_max_retries: usize,
    ) -> Self {
        Self {
            ingestion_cache_dir: ingestion_cache_dir.into(),
            force_refresh,
            refresh_embeddings_only,
            ingestion_batch_size,
            ingestion_max_retries,
        }
    }
}

impl From<&Config> for CorpusCacheConfig {
    fn from(config: &Config) -> Self {
        CorpusCacheConfig::new(
            config.ingest.ingestion_cache_dir.clone(),
            config.force_convert || config.ingest.slice_reset_ingestion,
            config.ingest.refresh_embeddings_only,
            config.ingest.ingestion_batch_size,
            config.ingest.ingestion_max_retries,
        )
    }
}

26
evaluations/src/corpus/mod.rs
Normal file
@@ -0,0 +1,26 @@
mod config;
mod orchestrator;
pub(crate) mod store;

pub use config::CorpusCacheConfig;
pub use orchestrator::{
    cached_corpus_dir, compute_ingestion_fingerprint, corpus_handle_from_manifest, ensure_corpus,
    load_cached_manifest,
};
pub use store::{
    seed_manifest_into_db, window_manifest, CorpusHandle, CorpusManifest, CorpusMetadata,
    CorpusQuestion, EmbeddedKnowledgeEntity, EmbeddedTextChunk, ParagraphShard,
    ParagraphShardStore, MANIFEST_VERSION,
};

pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline::IngestionConfig {
    ingestion_pipeline::IngestionConfig {
        tuning: ingestion_pipeline::IngestionTuning {
            chunk_min_tokens: config.ingest.ingest_chunk_min_tokens,
            chunk_max_tokens: config.ingest.ingest_chunk_max_tokens,
            chunk_overlap_tokens: config.ingest.ingest_chunk_overlap_tokens,
            ..Default::default()
        },
        chunk_only: config.ingest.ingest_chunks_only,
    }
}

806
evaluations/src/corpus/orchestrator.rs
Normal file
@@ -0,0 +1,806 @@
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
fs,
|
||||
io::Read,
|
||||
path::{Path, PathBuf},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use async_openai::Client;
|
||||
use chrono::Utc;
|
||||
#[cfg(not(test))]
|
||||
use common::utils::config::get_config;
|
||||
use common::{
|
||||
storage::{
|
||||
db::SurrealDbClient,
|
||||
store::{DynStore, StorageManager},
|
||||
types::{ingestion_payload::IngestionPayload, ingestion_task::IngestionTask, StoredObject},
|
||||
},
|
||||
utils::config::{AppConfig, StorageKind},
|
||||
};
|
||||
use futures::future::try_join_all;
|
||||
use ingestion_pipeline::{IngestionConfig, IngestionPipeline};
|
||||
use object_store::memory::InMemory;
|
||||
use sha2::{Digest, Sha256};
|
||||
use tracing::{info, warn};
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::{
|
||||
datasets::{ConvertedDataset, ConvertedParagraph, ConvertedQuestion},
|
||||
slice::{self, ResolvedSlice, SliceParagraphKind},
|
||||
};
|
||||
|
||||
use crate::corpus::{
|
||||
CorpusCacheConfig, CorpusHandle, CorpusManifest, CorpusMetadata, CorpusQuestion,
|
||||
EmbeddedKnowledgeEntity, EmbeddedTextChunk, ParagraphShard, ParagraphShardStore,
|
||||
MANIFEST_VERSION,
|
||||
};
|
||||
|
||||
const INGESTION_SPEC_VERSION: u32 = 2;
|
||||
|
||||
type OpenAIClient = Client<async_openai::config::OpenAIConfig>;
|
||||
|
||||
#[derive(Clone)]
|
||||
struct ParagraphShardRecord {
|
||||
shard: ParagraphShard,
|
||||
dirty: bool,
|
||||
needs_reembed: bool,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct IngestRequest<'a> {
|
||||
slot: usize,
|
||||
paragraph: &'a ConvertedParagraph,
|
||||
shard_path: String,
|
||||
question_refs: Vec<&'a ConvertedQuestion>,
|
||||
}
|
||||
|
||||
impl<'a> IngestRequest<'a> {
|
||||
fn from_entry(
|
||||
slot: usize,
|
||||
paragraph: &'a ConvertedParagraph,
|
||||
entry: &'a slice::SliceParagraphEntry,
|
||||
) -> Result<Self> {
|
||||
let shard_path = entry
|
||||
.shard_path
|
||||
.clone()
|
||||
.unwrap_or_else(|| slice::default_shard_path(&entry.id));
|
||||
let question_refs = match &entry.kind {
|
||||
SliceParagraphKind::Positive { question_ids } => question_ids
|
||||
.iter()
|
||||
.map(|id| {
|
||||
paragraph
|
||||
.questions
|
||||
.iter()
|
||||
.find(|question| question.id == *id)
|
||||
.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"paragraph '{}' missing question '{}' referenced by slice",
|
||||
paragraph.id,
|
||||
id
|
||||
)
|
||||
})
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?,
|
||||
SliceParagraphKind::Negative => Vec::new(),
|
||||
};
|
||||
Ok(Self {
|
||||
slot,
|
||||
paragraph,
|
||||
shard_path,
|
||||
question_refs,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
struct ParagraphPlan<'a> {
|
||||
slot: usize,
|
||||
entry: &'a slice::SliceParagraphEntry,
|
||||
paragraph: &'a ConvertedParagraph,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct IngestionStats {
|
||||
positive_reused: usize,
|
||||
positive_ingested: usize,
|
||||
negative_reused: usize,
|
||||
negative_ingested: usize,
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn ensure_corpus(
|
||||
dataset: &ConvertedDataset,
|
||||
slice: &ResolvedSlice<'_>,
|
||||
window: &slice::SliceWindow<'_>,
|
||||
cache: &CorpusCacheConfig,
|
||||
embedding: Arc<common::utils::embedding::EmbeddingProvider>,
|
||||
openai: Arc<OpenAIClient>,
|
||||
user_id: &str,
|
||||
converted_path: &Path,
|
||||
ingestion_config: IngestionConfig,
|
||||
) -> Result<CorpusHandle> {
|
||||
let checksum = compute_file_checksum(converted_path)
|
||||
.with_context(|| format!("computing checksum for {}", converted_path.display()))?;
|
||||
let ingestion_fingerprint =
|
||||
build_ingestion_fingerprint(dataset, slice, &checksum, &ingestion_config);
|
||||
|
||||
let base_dir = cached_corpus_dir(
|
||||
cache,
|
||||
dataset.metadata.id.as_str(),
|
||||
slice.manifest.slice_id.as_str(),
|
||||
);
|
||||
if cache.force_refresh && !cache.refresh_embeddings_only {
|
||||
let _ = fs::remove_dir_all(&base_dir);
|
||||
}
|
||||
let store = ParagraphShardStore::new(base_dir.clone());
|
||||
store.ensure_base_dir()?;
|
||||
|
||||
let positive_set: HashSet<&str> = window.positive_ids().collect();
|
||||
let require_verified_chunks = slice.manifest.require_verified_chunks;
|
||||
let embedding_backend_label = embedding.backend_label().to_string();
|
||||
let embedding_model_code = embedding.model_code();
|
||||
let embedding_dimension = embedding.dimension();
|
||||
if positive_set.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"window selection contains zero positive paragraphs for slice '{}'",
|
||||
slice.manifest.slice_id
|
||||
));
|
||||
}
|
||||
|
||||
let desired_negatives =
|
||||
((positive_set.len() as f32) * slice.manifest.negative_multiplier).ceil() as usize;
|
||||
let mut plan = Vec::new();
|
||||
let mut negatives_added = 0usize;
|
||||
for (idx, entry) in slice.manifest.paragraphs.iter().enumerate() {
|
||||
let include = match &entry.kind {
|
||||
SliceParagraphKind::Positive { .. } => positive_set.contains(entry.id.as_str()),
|
||||
SliceParagraphKind::Negative => {
|
||||
negatives_added < desired_negatives && {
|
||||
negatives_added += 1;
|
||||
true
|
||||
}
|
||||
}
|
||||
};
|
||||
if include {
|
||||
let paragraph = slice
|
||||
.paragraphs
|
||||
.get(idx)
|
||||
.copied()
|
||||
.ok_or_else(|| anyhow!("slice missing paragraph index {}", idx))?;
|
||||
plan.push(ParagraphPlan {
|
||||
slot: plan.len(),
|
||||
entry,
|
||||
paragraph,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
if plan.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"no paragraphs selected for ingestion (slice '{}')",
|
||||
slice.manifest.slice_id
|
||||
));
|
||||
}
|
||||
|
||||
let mut records: Vec<Option<ParagraphShardRecord>> = vec![None; plan.len()];
|
||||
let mut ingest_requests = Vec::new();
|
||||
let mut stats = IngestionStats::default();
|
||||
|
||||
for plan_entry in &plan {
|
||||
let shard_path = plan_entry
|
||||
.entry
|
||||
.shard_path
|
||||
.clone()
|
||||
.unwrap_or_else(|| slice::default_shard_path(&plan_entry.entry.id));
|
||||
let shard = if cache.force_refresh {
|
||||
None
|
||||
} else {
|
||||
store.load(&shard_path, &ingestion_fingerprint)?
|
||||
};
|
||||
if let Some(shard) = shard {
|
||||
let model_matches = shard.embedding_model.as_deref() == embedding_model_code.as_deref();
|
||||
let needs_reembed = shard.embedding_backend != embedding_backend_label
|
||||
|| shard.embedding_dimension != embedding_dimension
|
||||
|| !model_matches;
|
||||
match plan_entry.entry.kind {
|
||||
SliceParagraphKind::Positive { .. } => stats.positive_reused += 1,
|
||||
SliceParagraphKind::Negative => stats.negative_reused += 1,
|
||||
}
|
||||
records[plan_entry.slot] = Some(ParagraphShardRecord {
|
||||
shard,
|
||||
dirty: false,
|
||||
needs_reembed,
|
||||
});
|
||||
} else {
|
||||
match plan_entry.entry.kind {
|
||||
SliceParagraphKind::Positive { .. } => stats.positive_ingested += 1,
|
||||
SliceParagraphKind::Negative => stats.negative_ingested += 1,
|
||||
}
|
||||
let request =
|
||||
IngestRequest::from_entry(plan_entry.slot, plan_entry.paragraph, plan_entry.entry)?;
|
||||
ingest_requests.push(request);
|
||||
}
|
||||
}
|
||||
|
||||
if cache.refresh_embeddings_only && !ingest_requests.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"--refresh-embeddings requested but {} shard(s) missing for dataset '{}' slice '{}'",
|
||||
ingest_requests.len(),
|
||||
dataset.metadata.id,
|
||||
slice.manifest.slice_id
|
||||
));
|
||||
}
|
||||
|
||||
if !ingest_requests.is_empty() {
|
||||
let new_shards = ingest_paragraph_batch(
|
||||
dataset,
|
||||
&ingest_requests,
|
||||
embedding.clone(),
|
||||
openai.clone(),
|
||||
user_id,
|
||||
&ingestion_fingerprint,
|
||||
&embedding_backend_label,
|
||||
embedding_model_code.clone(),
|
||||
embedding_dimension,
|
||||
cache.ingestion_batch_size,
|
||||
cache.ingestion_max_retries,
|
||||
ingestion_config.clone(),
|
||||
)
|
||||
.await
|
||||
.context("ingesting missing slice paragraphs")?;
|
||||
for (request, shard) in ingest_requests.into_iter().zip(new_shards.into_iter()) {
|
||||
store.persist(&shard)?;
|
||||
records[request.slot] = Some(ParagraphShardRecord {
|
||||
shard,
|
||||
dirty: false,
|
||||
needs_reembed: false,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
for record in &mut records {
|
||||
let shard_record = record
|
||||
.as_mut()
|
||||
.context("shard record missing after ingestion run")?;
|
||||
if cache.refresh_embeddings_only || shard_record.needs_reembed {
|
||||
// Embeddings are now generated by the pipeline using FastEmbed - no need to re-embed
|
||||
shard_record.shard.ingestion_fingerprint = ingestion_fingerprint.clone();
|
||||
shard_record.shard.ingested_at = Utc::now();
|
||||
shard_record.shard.embedding_backend = embedding_backend_label.clone();
|
||||
shard_record.shard.embedding_model = embedding_model_code.clone();
|
||||
shard_record.shard.embedding_dimension = embedding_dimension;
|
||||
shard_record.dirty = true;
|
||||
shard_record.needs_reembed = false;
|
||||
}
|
||||
}
|
||||
|
||||
let mut record_index = HashMap::new();
|
||||
for (idx, plan_entry) in plan.iter().enumerate() {
|
||||
record_index.insert(plan_entry.entry.id.as_str(), idx);
|
||||
}
|
||||
|
||||
let mut corpus_paragraphs = Vec::with_capacity(plan.len());
|
||||
for record in &records {
|
||||
let shard = &record.as_ref().expect("record missing").shard;
|
||||
corpus_paragraphs.push(shard.to_corpus_paragraph());
|
||||
}
|
||||
|
||||
let mut corpus_questions = Vec::with_capacity(window.cases.len());
|
||||
for case in &window.cases {
|
||||
let slot = record_index
|
||||
.get(case.paragraph.id.as_str())
|
||||
.copied()
|
||||
.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"slice case references paragraph '{}' that is not part of the window",
|
||||
case.paragraph.id
|
||||
)
|
||||
})?;
|
||||
let record_slot = records
|
||||
.get_mut(slot)
|
||||
.context("shard record slot missing for question binding")?;
|
||||
let record = record_slot
|
||||
.as_mut()
|
||||
.context("shard record missing for question binding")?;
|
||||
let (chunk_ids, updated) = match record.shard.ensure_question_binding(case.question) {
|
||||
Ok(result) => result,
|
||||
Err(err) => {
|
||||
if require_verified_chunks {
|
||||
return Err(err).context(format!(
|
||||
"locating answer text for question '{}' in paragraph '{}'",
|
||||
case.question.id, case.paragraph.id
|
||||
));
|
||||
}
|
||||
warn!(
|
||||
question_id = %case.question.id,
|
||||
paragraph_id = %case.paragraph.id,
|
||||
error = %err,
|
||||
"Failed to locate answer text in ingested content; recording empty chunk bindings"
|
||||
);
|
||||
record
|
||||
.shard
|
||||
.question_bindings
|
||||
.insert(case.question.id.clone(), Vec::new());
|
||||
record.dirty = true;
|
||||
(Vec::new(), true)
|
||||
}
|
||||
};
|
||||
if updated {
|
||||
record.dirty = true;
|
||||
}
|
||||
corpus_questions.push(CorpusQuestion {
|
||||
question_id: case.question.id.clone(),
|
||||
paragraph_id: case.paragraph.id.clone(),
|
||||
text_content_id: record.shard.text_content.get_id().to_string(),
|
||||
question_text: case.question.question.clone(),
|
||||
answers: case.question.answers.clone(),
|
||||
is_impossible: case.question.is_impossible,
|
||||
matching_chunk_ids: chunk_ids,
|
||||
});
|
||||
}
|
||||
|
||||
for entry in records.iter_mut().flatten() {
|
||||
if entry.dirty {
|
||||
store.persist(&entry.shard)?;
|
||||
}
|
||||
}
|
||||
|
||||
let manifest = CorpusManifest {
|
||||
version: MANIFEST_VERSION,
|
||||
metadata: CorpusMetadata {
|
||||
dataset_id: dataset.metadata.id.clone(),
|
||||
dataset_label: dataset.metadata.label.clone(),
|
||||
slice_id: slice.manifest.slice_id.clone(),
|
||||
include_unanswerable: slice.manifest.includes_unanswerable,
|
||||
require_verified_chunks: slice.manifest.require_verified_chunks,
|
||||
ingestion_fingerprint: ingestion_fingerprint.clone(),
|
||||
embedding_backend: embedding.backend_label().to_string(),
|
||||
embedding_model: embedding.model_code(),
|
||||
embedding_dimension: embedding.dimension(),
|
||||
converted_checksum: checksum,
|
||||
generated_at: Utc::now(),
|
||||
paragraph_count: corpus_paragraphs.len(),
|
||||
question_count: corpus_questions.len(),
|
||||
chunk_min_tokens: ingestion_config.tuning.chunk_min_tokens,
|
||||
chunk_max_tokens: ingestion_config.tuning.chunk_max_tokens,
|
||||
chunk_only: ingestion_config.chunk_only,
|
||||
},
|
||||
paragraphs: corpus_paragraphs,
|
||||
questions: corpus_questions,
|
||||
};
|
||||
|
||||
let ingested_count = stats.positive_ingested + stats.negative_ingested;
|
||||
let reused_ingestion = ingested_count == 0 && !cache.force_refresh;
|
||||
let reused_embeddings = reused_ingestion && !cache.refresh_embeddings_only;
|
||||
|
||||
info!(
|
||||
dataset = %dataset.metadata.id,
|
||||
slice = %slice.manifest.slice_id,
|
||||
fingerprint = %ingestion_fingerprint,
|
||||
reused_ingestion,
|
||||
reused_embeddings,
|
||||
positive_reused = stats.positive_reused,
|
||||
positive_ingested = stats.positive_ingested,
|
||||
negative_reused = stats.negative_reused,
|
||||
negative_ingested = stats.negative_ingested,
|
||||
shard_dir = %base_dir.display(),
|
||||
"Corpus cache outcome"
|
||||
);
|
||||
|
||||
let handle = CorpusHandle {
|
||||
manifest,
|
||||
path: base_dir,
|
||||
reused_ingestion,
|
||||
reused_embeddings,
|
||||
positive_reused: stats.positive_reused,
|
||||
positive_ingested: stats.positive_ingested,
|
||||
negative_reused: stats.negative_reused,
|
||||
negative_ingested: stats.negative_ingested,
|
||||
};
|
||||
|
||||
persist_manifest(&handle).context("persisting corpus manifest")?;
|
||||
|
||||
Ok(handle)
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn ingest_paragraph_batch(
|
||||
dataset: &ConvertedDataset,
|
||||
targets: &[IngestRequest<'_>],
|
||||
embedding: Arc<common::utils::embedding::EmbeddingProvider>,
|
||||
openai: Arc<OpenAIClient>,
|
||||
user_id: &str,
|
||||
ingestion_fingerprint: &str,
|
||||
embedding_backend: &str,
|
||||
embedding_model: Option<String>,
|
||||
embedding_dimension: usize,
|
||||
batch_size: usize,
|
||||
max_retries: usize,
|
||||
ingestion_config: IngestionConfig,
|
||||
) -> Result<Vec<ParagraphShard>> {
|
||||
if targets.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
let namespace = format!("ingest_eval_{}", Uuid::new_v4());
|
||||
let db = create_ingest_db(&namespace).await?;
|
||||
db.apply_migrations()
|
||||
.await
|
||||
.context("applying migrations for ingestion")?;
|
||||
|
||||
let app_config = AppConfig {
|
||||
storage: StorageKind::Memory,
|
||||
..Default::default()
|
||||
};
|
||||
let backend: DynStore = Arc::new(InMemory::new());
|
||||
let storage = StorageManager::with_backend(backend, StorageKind::Memory);
|
||||
|
||||
let pipeline_config = ingestion_config.clone();
|
||||
let pipeline = IngestionPipeline::new_with_config(
|
||||
db,
|
||||
openai.clone(),
|
||||
app_config,
|
||||
None::<Arc<retrieval_pipeline::reranking::RerankerPool>>,
|
||||
storage,
|
||||
embedding.clone(),
|
||||
pipeline_config,
|
||||
)?;
|
||||
let pipeline = Arc::new(pipeline);
|
||||
|
||||
let mut shards = Vec::with_capacity(targets.len());
|
||||
let category = dataset.metadata.category.clone();
|
||||
for (batch_index, batch) in targets.chunks(batch_size).enumerate() {
|
||||
info!(
|
||||
batch = batch_index,
|
||||
batch_size = batch.len(),
|
||||
total_batches = targets.len().div_ceil(batch_size),
|
||||
"Ingesting paragraph batch"
|
||||
);
|
||||
let model_clone = embedding_model.clone();
|
||||
let backend_clone = embedding_backend.to_string();
|
||||
let pipeline_clone = pipeline.clone();
|
||||
let category_clone = category.clone();
|
||||
let tasks = batch.iter().cloned().map(move |request| {
|
||||
ingest_single_paragraph(
|
||||
pipeline_clone.clone(),
|
||||
request,
|
||||
category_clone.clone(),
|
||||
user_id,
|
||||
ingestion_fingerprint,
|
||||
backend_clone.clone(),
|
||||
model_clone.clone(),
|
||||
embedding_dimension,
|
||||
max_retries,
|
||||
ingestion_config.tuning.chunk_min_tokens,
|
||||
ingestion_config.tuning.chunk_max_tokens,
|
||||
ingestion_config.chunk_only,
|
||||
)
|
||||
});
|
||||
let batch_results: Vec<ParagraphShard> = try_join_all(tasks)
|
||||
.await
|
||||
.context("ingesting batch of paragraphs")?;
|
||||
shards.extend(batch_results);
|
||||
}
|
||||
|
||||
Ok(shards)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
async fn create_ingest_db(namespace: &str) -> Result<Arc<SurrealDbClient>> {
|
||||
let db = SurrealDbClient::memory(namespace, "corpus")
|
||||
.await
|
||||
.context("creating in-memory surrealdb for ingestion")?;
|
||||
Ok(Arc::new(db))
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
async fn create_ingest_db(namespace: &str) -> Result<Arc<SurrealDbClient>> {
|
||||
let config = get_config().context("loading app config for ingestion database")?;
|
||||
let db = SurrealDbClient::new(
|
||||
&config.surrealdb_address,
|
||||
&config.surrealdb_username,
|
||||
&config.surrealdb_password,
|
||||
namespace,
|
||||
"corpus",
|
||||
)
|
||||
.await
|
||||
.context("creating surrealdb database for ingestion")?;
|
||||
Ok(Arc::new(db))
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn ingest_single_paragraph(
|
||||
pipeline: Arc<IngestionPipeline>,
|
||||
request: IngestRequest<'_>,
|
||||
category: String,
|
||||
user_id: &str,
|
||||
ingestion_fingerprint: &str,
|
||||
embedding_backend: String,
|
||||
embedding_model: Option<String>,
|
||||
embedding_dimension: usize,
|
||||
max_retries: usize,
|
||||
chunk_min_tokens: usize,
|
||||
chunk_max_tokens: usize,
|
||||
chunk_only: bool,
|
||||
) -> Result<ParagraphShard> {
|
||||
let paragraph = request.paragraph;
|
||||
let mut last_err: Option<anyhow::Error> = None;
|
||||
for attempt in 1..=max_retries {
|
||||
let payload = IngestionPayload::Text {
|
||||
text: paragraph.context.clone(),
|
||||
context: paragraph.title.clone(),
|
||||
category: category.clone(),
|
||||
user_id: user_id.to_string(),
|
||||
};
|
||||
let task = IngestionTask::new(payload, user_id.to_string());
|
||||
match pipeline.produce_artifacts(&task).await {
|
||||
Ok(artifacts) => {
|
||||
let entities: Vec<EmbeddedKnowledgeEntity> = artifacts
|
||||
.entities
|
||||
.into_iter()
|
||||
.map(|e| EmbeddedKnowledgeEntity {
|
||||
entity: e.entity,
|
||||
embedding: e.embedding,
|
||||
})
|
||||
.collect();
|
||||
let chunks: Vec<EmbeddedTextChunk> = artifacts
|
||||
.chunks
|
||||
.into_iter()
|
||||
.map(|c| EmbeddedTextChunk {
|
||||
chunk: c.chunk,
|
||||
embedding: c.embedding,
|
||||
})
|
||||
.collect();
|
||||
// No need to reembed - pipeline now uses FastEmbed internally
|
||||
let mut shard = ParagraphShard::new(
|
||||
paragraph,
|
||||
request.shard_path,
|
||||
ingestion_fingerprint,
|
||||
artifacts.text_content,
|
||||
entities,
|
||||
artifacts.relationships,
|
||||
chunks,
|
||||
&embedding_backend,
|
||||
embedding_model.clone(),
|
||||
embedding_dimension,
|
||||
chunk_min_tokens,
|
||||
chunk_max_tokens,
|
||||
chunk_only,
|
||||
);
|
||||
for question in &request.question_refs {
|
||||
if let Err(err) = shard.ensure_question_binding(question) {
|
||||
warn!(
|
||||
question_id = %question.id,
|
||||
paragraph_id = %paragraph.id,
|
||||
error = %err,
|
||||
"Failed to locate answer text in ingested content; recording empty chunk bindings"
|
||||
);
|
||||
shard
|
||||
.question_bindings
|
||||
.insert(question.id.clone(), Vec::new());
|
||||
}
|
||||
}
|
||||
return Ok(shard);
|
||||
}
|
||||
Err(err) => {
|
||||
warn!(
|
||||
paragraph_id = %paragraph.id,
|
||||
attempt,
|
||||
max_attempts = max_retries,
|
||||
error = ?err,
|
||||
"ingestion attempt failed for paragraph; retrying"
|
||||
);
|
||||
last_err = Some(err.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(last_err
|
||||
.unwrap_or_else(|| anyhow!("ingestion failed"))
|
||||
.context(format!("running ingestion for paragraph {}", paragraph.id)))
|
||||
}
|
||||
|
||||
pub fn cached_corpus_dir(cache: &CorpusCacheConfig, dataset_id: &str, slice_id: &str) -> PathBuf {
|
||||
cache.ingestion_cache_dir.join(dataset_id).join(slice_id)
|
||||
}
|
||||
|
||||
pub fn build_ingestion_fingerprint(
|
||||
dataset: &ConvertedDataset,
|
||||
slice: &ResolvedSlice<'_>,
|
||||
checksum: &str,
|
||||
ingestion_config: &IngestionConfig,
|
||||
) -> String {
|
||||
let config_repr = format!("{:?}", ingestion_config);
|
||||
let mut hasher = Sha256::new();
|
||||
hasher.update(config_repr.as_bytes());
|
||||
let config_hash = format!("{:x}", hasher.finalize());
|
||||
|
||||
format!(
|
||||
"v{INGESTION_SPEC_VERSION}:{}:{}:{}:{}:{}",
|
||||
dataset.metadata.id,
|
||||
slice.manifest.slice_id,
|
||||
slice.manifest.includes_unanswerable,
|
||||
checksum,
|
||||
config_hash
|
||||
)
|
||||
}
|
||||
|
||||
pub fn compute_ingestion_fingerprint(
|
||||
dataset: &ConvertedDataset,
|
||||
slice: &ResolvedSlice<'_>,
|
||||
converted_path: &Path,
|
||||
ingestion_config: &IngestionConfig,
|
||||
) -> Result<String> {
|
||||
let checksum = compute_file_checksum(converted_path)?;
|
||||
Ok(build_ingestion_fingerprint(
|
||||
dataset,
|
||||
slice,
|
||||
&checksum,
|
||||
ingestion_config,
|
||||
))
|
||||
}
|
||||
|
||||
pub fn load_cached_manifest(base_dir: &Path) -> Result<Option<CorpusManifest>> {
|
||||
let path = base_dir.join("manifest.json");
|
||||
if !path.exists() {
|
||||
return Ok(None);
|
||||
}
|
||||
let mut file = fs::File::open(&path)
|
||||
.with_context(|| format!("opening cached manifest {}", path.display()))?;
|
||||
let mut buf = Vec::new();
|
||||
file.read_to_end(&mut buf)
|
||||
.with_context(|| format!("reading cached manifest {}", path.display()))?;
|
||||
let manifest: CorpusManifest = serde_json::from_slice(&buf)
|
||||
.with_context(|| format!("deserialising cached manifest {}", path.display()))?;
|
||||
Ok(Some(manifest))
|
||||
}
|
||||
|
||||
fn persist_manifest(handle: &CorpusHandle) -> Result<()> {
|
||||
let path = handle.path.join("manifest.json");
|
||||
if let Some(parent) = path.parent() {
|
||||
fs::create_dir_all(parent)
|
||||
.with_context(|| format!("creating manifest directory {}", parent.display()))?;
|
||||
}
|
||||
let tmp_path = path.with_extension("json.tmp");
|
||||
let blob =
|
||||
serde_json::to_vec_pretty(&handle.manifest).context("serialising corpus manifest")?;
|
||||
fs::write(&tmp_path, &blob)
|
||||
.with_context(|| format!("writing temporary manifest {}", tmp_path.display()))?;
|
||||
fs::rename(&tmp_path, &path)
|
||||
.with_context(|| format!("replacing manifest {}", path.display()))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn corpus_handle_from_manifest(manifest: CorpusManifest, base_dir: PathBuf) -> CorpusHandle {
|
||||
CorpusHandle {
|
||||
manifest,
|
||||
path: base_dir,
|
||||
reused_ingestion: true,
|
||||
reused_embeddings: true,
|
||||
positive_reused: 0,
|
||||
positive_ingested: 0,
|
||||
negative_reused: 0,
|
||||
negative_ingested: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn compute_file_checksum(path: &Path) -> Result<String> {
|
||||
let mut file = fs::File::open(path)
|
||||
.with_context(|| format!("opening file {} for checksum", path.display()))?;
|
||||
let mut hasher = Sha256::new();
|
||||
let mut buffer = [0u8; 8192];
|
||||
loop {
|
||||
let read = file
|
||||
.read(&mut buffer)
|
||||
.with_context(|| format!("reading {} for checksum", path.display()))?;
|
||||
if read == 0 {
|
||||
break;
|
||||
}
|
||||
hasher.update(&buffer[..read]);
|
||||
}
|
||||
Ok(format!("{:x}", hasher.finalize()))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::{
|
||||
datasets::{ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind},
|
||||
slice::{CaseRef, SliceCaseEntry, SliceManifest, SliceParagraphEntry, SliceParagraphKind},
|
||||
};
|
||||
use chrono::Utc;
|
||||
|
||||
fn dummy_dataset() -> ConvertedDataset {
|
||||
let question = ConvertedQuestion {
|
||||
id: "q1".to_string(),
|
||||
question: "What?".to_string(),
|
||||
answers: vec!["A".to_string()],
|
||||
is_impossible: false,
|
||||
};
|
||||
let paragraph = ConvertedParagraph {
|
||||
id: "p1".to_string(),
|
||||
title: "title".to_string(),
|
||||
context: "context".to_string(),
|
||||
questions: vec![question],
|
||||
};
|
||||
|
||||
ConvertedDataset {
|
||||
generated_at: Utc::now(),
|
||||
metadata: crate::datasets::DatasetMetadata::for_kind(
|
||||
DatasetKind::default(),
|
||||
false,
|
||||
None,
|
||||
),
|
||||
source: "src".to_string(),
|
||||
paragraphs: vec![paragraph],
|
||||
}
|
||||
}
|
||||
|
||||
fn dummy_slice<'a>(dataset: &'a ConvertedDataset) -> ResolvedSlice<'a> {
|
||||
let paragraph = &dataset.paragraphs[0];
|
||||
let question = ¶graph.questions[0];
|
||||
let manifest = SliceManifest {
|
||||
version: 1,
|
||||
slice_id: "slice-1".to_string(),
|
||||
dataset_id: dataset.metadata.id.clone(),
|
||||
dataset_label: dataset.metadata.label.clone(),
|
||||
dataset_source: dataset.source.clone(),
|
||||
includes_unanswerable: false,
|
||||
require_verified_chunks: false,
|
||||
seed: 1,
|
||||
requested_limit: Some(1),
|
||||
requested_corpus: 1,
|
||||
generated_at: Utc::now(),
|
||||
case_count: 1,
|
||||
positive_paragraphs: 1,
|
||||
negative_paragraphs: 0,
|
||||
total_paragraphs: 1,
|
||||
negative_multiplier: 1.0,
|
||||
cases: vec![SliceCaseEntry {
|
||||
question_id: question.id.clone(),
|
||||
paragraph_id: paragraph.id.clone(),
|
||||
}],
|
||||
paragraphs: vec![SliceParagraphEntry {
|
||||
id: paragraph.id.clone(),
|
||||
kind: SliceParagraphKind::Positive {
|
||||
question_ids: vec![question.id.clone()],
|
||||
},
|
||||
shard_path: None,
|
||||
}],
|
||||
};
|
||||
|
||||
ResolvedSlice {
|
||||
manifest,
|
||||
path: PathBuf::from("cache"),
|
||||
paragraphs: dataset.paragraphs.iter().collect(),
|
||||
cases: vec![CaseRef {
|
||||
paragraph,
|
||||
question,
|
||||
}],
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fingerprint_changes_with_chunk_settings() {
|
||||
let dataset = dummy_dataset();
|
||||
let slice = dummy_slice(&dataset);
|
||||
let checksum = "deadbeef";
|
||||
|
||||
let base_config = IngestionConfig::default();
|
||||
let fp_base = build_ingestion_fingerprint(&dataset, &slice, checksum, &base_config);
|
||||
|
||||
let mut token_config = base_config.clone();
|
||||
token_config.tuning.chunk_min_tokens += 1;
|
||||
let fp_token = build_ingestion_fingerprint(&dataset, &slice, checksum, &token_config);
|
||||
assert_ne!(fp_base, fp_token, "token bounds should affect fingerprint");
|
||||
|
||||
let mut chunk_only_config = base_config;
|
||||
chunk_only_config.chunk_only = true;
|
||||
let fp_chunk_only =
|
||||
build_ingestion_fingerprint(&dataset, &slice, checksum, &chunk_only_config);
|
||||
assert_ne!(
|
||||
fp_base, fp_chunk_only,
|
||||
"chunk-only mode should affect fingerprint"
|
||||
);
|
||||
}
|
||||
}
|
||||
934
evaluations/src/corpus/store.rs
Normal file
934
evaluations/src/corpus/store.rs
Normal file
@@ -0,0 +1,934 @@
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
fs,
|
||||
io::BufReader,
|
||||
path::PathBuf,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use common::storage::types::StoredObject;
|
||||
use common::storage::{
|
||||
db::SurrealDbClient,
|
||||
types::{
|
||||
knowledge_entity::KnowledgeEntity,
|
||||
knowledge_entity_embedding::KnowledgeEntityEmbedding,
|
||||
knowledge_relationship::{KnowledgeRelationship, RelationshipMetadata},
|
||||
text_chunk::TextChunk,
|
||||
text_chunk_embedding::TextChunkEmbedding,
|
||||
text_content::TextContent,
|
||||
},
|
||||
};
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use surrealdb::sql::Thing;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
use crate::datasets::{ConvertedParagraph, ConvertedQuestion};
|
||||
|
||||
pub const MANIFEST_VERSION: u32 = 3;
|
||||
pub const PARAGRAPH_SHARD_VERSION: u32 = 3;
|
||||
const MANIFEST_BATCH_SIZE: usize = 100;
|
||||
const MANIFEST_MAX_BYTES_PER_BATCH: usize = 300_000; // default cap for non-text batches
|
||||
const TEXT_CONTENT_MAX_BYTES_PER_BATCH: usize = 250_000; // text bodies can be large; limit aggressively
|
||||
const MAX_BATCHES_PER_REQUEST: usize = 24;
|
||||
const REQUEST_MAX_BYTES: usize = 800_000; // total payload cap per Surreal query request
|
||||
|
||||
fn current_manifest_version() -> u32 {
|
||||
MANIFEST_VERSION
|
||||
}
|
||||
|
||||
fn current_paragraph_shard_version() -> u32 {
|
||||
PARAGRAPH_SHARD_VERSION
|
||||
}
|
||||
|
||||
fn default_chunk_min_tokens() -> usize {
|
||||
500
|
||||
}
|
||||
|
||||
fn default_chunk_max_tokens() -> usize {
|
||||
2_000
|
||||
}
|
||||
|
||||
fn default_chunk_only() -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
pub struct EmbeddedKnowledgeEntity {
|
||||
pub entity: KnowledgeEntity,
|
||||
pub embedding: Vec<f32>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
pub struct EmbeddedTextChunk {
|
||||
pub chunk: TextChunk,
|
||||
pub embedding: Vec<f32>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Deserialize)]
|
||||
struct LegacyKnowledgeEntity {
|
||||
#[serde(flatten)]
|
||||
pub entity: KnowledgeEntity,
|
||||
#[serde(default)]
|
||||
pub embedding: Vec<f32>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Deserialize)]
|
||||
struct LegacyTextChunk {
|
||||
#[serde(flatten)]
|
||||
pub chunk: TextChunk,
|
||||
#[serde(default)]
|
||||
pub embedding: Vec<f32>,
|
||||
}
|
||||
|
||||
fn deserialize_embedded_entities<'de, D>(
|
||||
deserializer: D,
|
||||
) -> Result<Vec<EmbeddedKnowledgeEntity>, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
#[derive(serde::Deserialize)]
|
||||
#[serde(untagged)]
|
||||
enum EntityInput {
|
||||
Embedded(Vec<EmbeddedKnowledgeEntity>),
|
||||
Legacy(Vec<LegacyKnowledgeEntity>),
|
||||
}
|
||||
|
||||
match EntityInput::deserialize(deserializer)? {
|
||||
EntityInput::Embedded(items) => Ok(items),
|
||||
EntityInput::Legacy(items) => Ok(items
|
||||
.into_iter()
|
||||
.map(|legacy| EmbeddedKnowledgeEntity {
|
||||
entity: legacy.entity,
|
||||
embedding: legacy.embedding,
|
||||
})
|
||||
.collect()),
|
||||
}
|
||||
}
|
||||
|
||||
fn deserialize_embedded_chunks<'de, D>(deserializer: D) -> Result<Vec<EmbeddedTextChunk>, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
#[derive(serde::Deserialize)]
|
||||
#[serde(untagged)]
|
||||
enum ChunkInput {
|
||||
Embedded(Vec<EmbeddedTextChunk>),
|
||||
Legacy(Vec<LegacyTextChunk>),
|
||||
}
|
||||
|
||||
match ChunkInput::deserialize(deserializer)? {
|
||||
ChunkInput::Embedded(items) => Ok(items),
|
||||
ChunkInput::Legacy(items) => Ok(items
|
||||
.into_iter()
|
||||
.map(|legacy| EmbeddedTextChunk {
|
||||
chunk: legacy.chunk,
|
||||
embedding: legacy.embedding,
|
||||
})
|
||||
.collect()),
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
pub struct CorpusManifest {
|
||||
#[serde(default = "current_manifest_version")]
|
||||
pub version: u32,
|
||||
pub metadata: CorpusMetadata,
|
||||
pub paragraphs: Vec<CorpusParagraph>,
|
||||
pub questions: Vec<CorpusQuestion>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
pub struct CorpusMetadata {
|
||||
pub dataset_id: String,
|
||||
pub dataset_label: String,
|
||||
pub slice_id: String,
|
||||
pub include_unanswerable: bool,
|
||||
#[serde(default)]
|
||||
pub require_verified_chunks: bool,
|
||||
pub ingestion_fingerprint: String,
|
||||
pub embedding_backend: String,
|
||||
pub embedding_model: Option<String>,
|
||||
pub embedding_dimension: usize,
|
||||
pub converted_checksum: String,
|
||||
pub generated_at: DateTime<Utc>,
|
||||
pub paragraph_count: usize,
|
||||
pub question_count: usize,
|
||||
#[serde(default = "default_chunk_min_tokens")]
|
||||
pub chunk_min_tokens: usize,
|
||||
#[serde(default = "default_chunk_max_tokens")]
|
||||
pub chunk_max_tokens: usize,
|
||||
#[serde(default = "default_chunk_only")]
|
||||
pub chunk_only: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
pub struct CorpusParagraph {
|
||||
pub paragraph_id: String,
|
||||
pub title: String,
|
||||
pub text_content: TextContent,
|
||||
#[serde(deserialize_with = "deserialize_embedded_entities")]
|
||||
pub entities: Vec<EmbeddedKnowledgeEntity>,
|
||||
pub relationships: Vec<KnowledgeRelationship>,
|
||||
#[serde(deserialize_with = "deserialize_embedded_chunks")]
|
||||
pub chunks: Vec<EmbeddedTextChunk>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
pub struct CorpusQuestion {
|
||||
pub question_id: String,
|
||||
pub paragraph_id: String,
|
||||
pub text_content_id: String,
|
||||
pub question_text: String,
|
||||
pub answers: Vec<String>,
|
||||
pub is_impossible: bool,
|
||||
pub matching_chunk_ids: Vec<String>,
|
||||
}
|
||||
|
||||
pub struct CorpusHandle {
|
||||
pub manifest: CorpusManifest,
|
||||
pub path: PathBuf,
|
||||
pub reused_ingestion: bool,
|
||||
pub reused_embeddings: bool,
|
||||
pub positive_reused: usize,
|
||||
pub positive_ingested: usize,
|
||||
pub negative_reused: usize,
|
||||
pub negative_ingested: usize,
|
||||
}
|
||||
|
||||
pub fn window_manifest(
|
||||
manifest: &CorpusManifest,
|
||||
offset: usize,
|
||||
length: usize,
|
||||
negative_multiplier: f32,
|
||||
) -> Result<CorpusManifest> {
|
||||
let total = manifest.questions.len();
|
||||
if total == 0 {
|
||||
return Err(anyhow!(
|
||||
"manifest contains no questions; cannot select a window"
|
||||
));
|
||||
}
|
||||
if offset >= total {
|
||||
return Err(anyhow!(
|
||||
"window offset {} exceeds manifest questions ({})",
|
||||
offset,
|
||||
total
|
||||
));
|
||||
}
|
||||
let end = (offset + length).min(total);
|
||||
let questions = manifest.questions[offset..end].to_vec();
|
||||
|
||||
let selected_positive_ids: HashSet<_> =
|
||||
questions.iter().map(|q| q.paragraph_id.clone()).collect();
|
||||
let positives_all: HashSet<_> = manifest
|
||||
.questions
|
||||
.iter()
|
||||
.map(|q| q.paragraph_id.as_str())
|
||||
.collect();
|
||||
let available_negatives = manifest
|
||||
.paragraphs
|
||||
.len()
|
||||
.saturating_sub(positives_all.len());
|
||||
let desired_negatives =
|
||||
((selected_positive_ids.len() as f32) * negative_multiplier).ceil() as usize;
|
||||
let desired_negatives = desired_negatives.min(available_negatives);
|
||||
|
||||
let mut paragraphs = Vec::new();
|
||||
let mut negative_count = 0usize;
|
||||
for paragraph in &manifest.paragraphs {
|
||||
if selected_positive_ids.contains(¶graph.paragraph_id) {
|
||||
paragraphs.push(paragraph.clone());
|
||||
} else if negative_count < desired_negatives {
|
||||
paragraphs.push(paragraph.clone());
|
||||
negative_count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
let mut narrowed = manifest.clone();
|
||||
narrowed.questions = questions;
|
||||
narrowed.paragraphs = paragraphs;
|
||||
narrowed.metadata.paragraph_count = narrowed.paragraphs.len();
|
||||
narrowed.metadata.question_count = narrowed.questions.len();
|
||||
|
||||
Ok(narrowed)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
struct RelationInsert {
|
||||
#[serde(rename = "in")]
|
||||
pub in_: Thing,
|
||||
#[serde(rename = "out")]
|
||||
pub out: Thing,
|
||||
pub id: String,
|
||||
pub metadata: RelationshipMetadata,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct SizedBatch<T> {
|
||||
approx_bytes: usize,
|
||||
items: Vec<T>,
|
||||
}
|
||||
|
||||
struct ManifestBatches {
|
||||
text_contents: Vec<SizedBatch<TextContent>>,
|
||||
entities: Vec<SizedBatch<KnowledgeEntity>>,
|
||||
entity_embeddings: Vec<SizedBatch<KnowledgeEntityEmbedding>>,
|
||||
relationships: Vec<SizedBatch<RelationInsert>>,
|
||||
chunks: Vec<SizedBatch<TextChunk>>,
|
||||
chunk_embeddings: Vec<SizedBatch<TextChunkEmbedding>>,
|
||||
}
|
||||
|
||||
fn build_manifest_batches(manifest: &CorpusManifest) -> Result<ManifestBatches> {
|
||||
let mut text_contents = Vec::new();
|
||||
let mut entities = Vec::new();
|
||||
let mut entity_embeddings = Vec::new();
|
||||
let mut relationships = Vec::new();
|
||||
let mut chunks = Vec::new();
|
||||
let mut chunk_embeddings = Vec::new();
|
||||
|
||||
let mut seen_text_content = HashSet::new();
|
||||
let mut seen_entities = HashSet::new();
|
||||
let mut seen_relationships = HashSet::new();
|
||||
let mut seen_chunks = HashSet::new();
|
||||
|
||||
for paragraph in &manifest.paragraphs {
|
||||
if seen_text_content.insert(paragraph.text_content.id.clone()) {
|
||||
text_contents.push(paragraph.text_content.clone());
|
||||
}
|
||||
|
||||
for embedded_entity in ¶graph.entities {
|
||||
if seen_entities.insert(embedded_entity.entity.id.clone()) {
|
||||
let entity = embedded_entity.entity.clone();
|
||||
entities.push(entity.clone());
|
||||
entity_embeddings.push(KnowledgeEntityEmbedding::new(
|
||||
&entity.id,
|
||||
embedded_entity.embedding.clone(),
|
||||
entity.user_id.clone(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
for relationship in ¶graph.relationships {
|
||||
if seen_relationships.insert(relationship.id.clone()) {
|
||||
let table = KnowledgeEntity::table_name();
|
||||
let in_id = relationship
|
||||
.in_
|
||||
.strip_prefix(&format!("{table}:"))
|
||||
.unwrap_or(&relationship.in_);
|
||||
let out_id = relationship
|
||||
.out
|
||||
.strip_prefix(&format!("{table}:"))
|
||||
.unwrap_or(&relationship.out);
|
||||
let in_thing = Thing::from((table, in_id));
|
||||
let out_thing = Thing::from((table, out_id));
|
||||
relationships.push(RelationInsert {
|
||||
in_: in_thing,
|
||||
out: out_thing,
|
||||
id: relationship.id.clone(),
|
||||
metadata: relationship.metadata.clone(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
for embedded_chunk in ¶graph.chunks {
|
||||
if seen_chunks.insert(embedded_chunk.chunk.id.clone()) {
|
||||
let chunk = embedded_chunk.chunk.clone();
|
||||
chunks.push(chunk.clone());
|
||||
chunk_embeddings.push(TextChunkEmbedding::new(
|
||||
&chunk.id,
|
||||
chunk.source_id.clone(),
|
||||
embedded_chunk.embedding.clone(),
|
||||
chunk.user_id.clone(),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ManifestBatches {
|
||||
text_contents: chunk_items(
|
||||
&text_contents,
|
||||
MANIFEST_BATCH_SIZE,
|
||||
TEXT_CONTENT_MAX_BYTES_PER_BATCH,
|
||||
)
|
||||
.context("chunking text_content payloads")?,
|
||||
entities: chunk_items(&entities, MANIFEST_BATCH_SIZE, MANIFEST_MAX_BYTES_PER_BATCH)
|
||||
.context("chunking knowledge_entity payloads")?,
|
||||
entity_embeddings: chunk_items(
|
||||
&entity_embeddings,
|
||||
MANIFEST_BATCH_SIZE,
|
||||
MANIFEST_MAX_BYTES_PER_BATCH,
|
||||
)
|
||||
.context("chunking knowledge_entity_embedding payloads")?,
|
||||
relationships: chunk_items(
|
||||
&relationships,
|
||||
MANIFEST_BATCH_SIZE,
|
||||
MANIFEST_MAX_BYTES_PER_BATCH,
|
||||
)
|
||||
.context("chunking relationship payloads")?,
|
||||
chunks: chunk_items(&chunks, MANIFEST_BATCH_SIZE, MANIFEST_MAX_BYTES_PER_BATCH)
|
||||
.context("chunking text_chunk payloads")?,
|
||||
chunk_embeddings: chunk_items(
|
||||
&chunk_embeddings,
|
||||
MANIFEST_BATCH_SIZE,
|
||||
MANIFEST_MAX_BYTES_PER_BATCH,
|
||||
)
|
||||
.context("chunking text_chunk_embedding payloads")?,
|
||||
})
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
pub struct ParagraphShard {
|
||||
#[serde(default = "current_paragraph_shard_version")]
|
||||
pub version: u32,
|
||||
pub paragraph_id: String,
|
||||
pub shard_path: String,
|
||||
pub ingestion_fingerprint: String,
|
||||
pub ingested_at: DateTime<Utc>,
|
||||
pub title: String,
|
||||
pub text_content: TextContent,
|
||||
#[serde(deserialize_with = "deserialize_embedded_entities")]
|
||||
pub entities: Vec<EmbeddedKnowledgeEntity>,
|
||||
pub relationships: Vec<KnowledgeRelationship>,
|
||||
#[serde(deserialize_with = "deserialize_embedded_chunks")]
|
||||
pub chunks: Vec<EmbeddedTextChunk>,
|
||||
#[serde(default)]
|
||||
pub question_bindings: HashMap<String, Vec<String>>,
|
||||
#[serde(default)]
|
||||
pub embedding_backend: String,
|
||||
#[serde(default)]
|
||||
pub embedding_model: Option<String>,
|
||||
#[serde(default)]
|
||||
pub embedding_dimension: usize,
|
||||
#[serde(default = "default_chunk_min_tokens")]
|
||||
pub chunk_min_tokens: usize,
|
||||
#[serde(default = "default_chunk_max_tokens")]
|
||||
pub chunk_max_tokens: usize,
|
||||
#[serde(default = "default_chunk_only")]
|
||||
pub chunk_only: bool,
|
||||
}
|
||||
|
||||
pub struct ParagraphShardStore {
|
||||
base_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl ParagraphShardStore {
|
||||
pub fn new(base_dir: PathBuf) -> Self {
|
||||
Self { base_dir }
|
||||
}
|
||||
|
||||
pub fn ensure_base_dir(&self) -> Result<()> {
|
||||
fs::create_dir_all(&self.base_dir)
|
||||
.with_context(|| format!("creating shard base dir {}", self.base_dir.display()))
|
||||
}
|
||||
|
||||
fn resolve(&self, relative: &str) -> PathBuf {
|
||||
self.base_dir.join(relative)
|
||||
}
|
||||
|
||||
pub fn load(&self, relative: &str, fingerprint: &str) -> Result<Option<ParagraphShard>> {
|
||||
let path = self.resolve(relative);
|
||||
let file = match fs::File::open(&path) {
|
||||
Ok(file) => file,
|
||||
Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(None),
|
||||
Err(err) => {
|
||||
return Err(err).with_context(|| format!("opening shard {}", path.display()))
|
||||
}
|
||||
};
|
||||
let reader = BufReader::new(file);
|
||||
let mut shard: ParagraphShard = serde_json::from_reader(reader)
|
||||
.with_context(|| format!("parsing shard {}", path.display()))?;
|
||||
|
||||
if shard.ingestion_fingerprint != fingerprint {
|
||||
debug!(
|
||||
path = %path.display(),
|
||||
expected = fingerprint,
|
||||
found = shard.ingestion_fingerprint,
|
||||
"Shard fingerprint mismatch; will rebuild"
|
||||
);
|
||||
return Ok(None);
|
||||
}
|
||||
if shard.version != PARAGRAPH_SHARD_VERSION {
|
||||
warn!(
|
||||
path = %path.display(),
|
||||
version = shard.version,
|
||||
expected = PARAGRAPH_SHARD_VERSION,
|
||||
"Upgrading shard to current version"
|
||||
);
|
||||
shard.version = PARAGRAPH_SHARD_VERSION;
|
||||
}
|
||||
shard.shard_path = relative.to_string();
|
||||
Ok(Some(shard))
|
||||
}
|
||||
|
||||
pub fn persist(&self, shard: &ParagraphShard) -> Result<()> {
|
||||
let mut shard = shard.clone();
|
||||
shard.version = PARAGRAPH_SHARD_VERSION;
|
||||
|
||||
let path = self.resolve(&shard.shard_path);
|
||||
if let Some(parent) = path.parent() {
|
||||
fs::create_dir_all(parent)
|
||||
.with_context(|| format!("creating shard dir {}", parent.display()))?;
|
||||
}
|
||||
let tmp_path = path.with_extension("json.tmp");
|
||||
let body = serde_json::to_vec_pretty(&shard).context("serialising paragraph shard")?;
|
||||
fs::write(&tmp_path, &body)
|
||||
.with_context(|| format!("writing shard tmp {}", tmp_path.display()))?;
|
||||
fs::rename(&tmp_path, &path)
|
||||
.with_context(|| format!("renaming shard tmp {}", path.display()))?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl ParagraphShard {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new(
|
||||
paragraph: &ConvertedParagraph,
|
||||
shard_path: String,
|
||||
ingestion_fingerprint: &str,
|
||||
text_content: TextContent,
|
||||
entities: Vec<EmbeddedKnowledgeEntity>,
|
||||
relationships: Vec<KnowledgeRelationship>,
|
||||
chunks: Vec<EmbeddedTextChunk>,
|
||||
embedding_backend: &str,
|
||||
embedding_model: Option<String>,
|
||||
embedding_dimension: usize,
|
||||
chunk_min_tokens: usize,
|
||||
chunk_max_tokens: usize,
|
||||
chunk_only: bool,
|
||||
) -> Self {
|
||||
Self {
|
||||
version: PARAGRAPH_SHARD_VERSION,
|
||||
paragraph_id: paragraph.id.clone(),
|
||||
shard_path,
|
||||
ingestion_fingerprint: ingestion_fingerprint.to_string(),
|
||||
ingested_at: Utc::now(),
|
||||
title: paragraph.title.clone(),
|
||||
text_content,
|
||||
entities,
|
||||
relationships,
|
||||
chunks,
|
||||
question_bindings: HashMap::new(),
|
||||
embedding_backend: embedding_backend.to_string(),
|
||||
embedding_model,
|
||||
embedding_dimension,
|
||||
chunk_min_tokens,
|
||||
chunk_max_tokens,
|
||||
chunk_only,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_corpus_paragraph(&self) -> CorpusParagraph {
|
||||
CorpusParagraph {
|
||||
paragraph_id: self.paragraph_id.clone(),
|
||||
title: self.title.clone(),
|
||||
text_content: self.text_content.clone(),
|
||||
entities: self.entities.clone(),
|
||||
relationships: self.relationships.clone(),
|
||||
chunks: self.chunks.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn ensure_question_binding(
|
||||
&mut self,
|
||||
question: &ConvertedQuestion,
|
||||
) -> Result<(Vec<String>, bool)> {
|
||||
if let Some(existing) = self.question_bindings.get(&question.id) {
|
||||
return Ok((existing.clone(), false));
|
||||
}
|
||||
let chunk_ids = validate_answers(&self.text_content, &self.chunks, question)?;
|
||||
self.question_bindings
|
||||
.insert(question.id.clone(), chunk_ids.clone());
|
||||
Ok((chunk_ids, true))
|
||||
}
|
||||
}
|
||||
|
||||
fn validate_answers(
|
||||
content: &TextContent,
|
||||
chunks: &[EmbeddedTextChunk],
|
||||
question: &ConvertedQuestion,
|
||||
) -> Result<Vec<String>> {
|
||||
if question.is_impossible || question.answers.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let mut matches = std::collections::BTreeSet::new();
|
||||
let mut found_any = false;
|
||||
let haystack = content.text.to_ascii_lowercase();
|
||||
let haystack_norm = normalize_answer_text(&haystack);
|
||||
for answer in &question.answers {
|
||||
let needle: String = answer.to_ascii_lowercase();
|
||||
let needle_norm = normalize_answer_text(&needle);
|
||||
let text_match = haystack.contains(&needle)
|
||||
|| (!needle_norm.is_empty() && haystack_norm.contains(&needle_norm));
|
||||
if text_match {
|
||||
found_any = true;
|
||||
}
|
||||
for chunk in chunks {
|
||||
let chunk_text = chunk.chunk.chunk.to_ascii_lowercase();
|
||||
let chunk_norm = normalize_answer_text(&chunk_text);
|
||||
if chunk_text.contains(&needle)
|
||||
|| (!needle_norm.is_empty() && chunk_norm.contains(&needle_norm))
|
||||
{
|
||||
matches.insert(chunk.chunk.get_id().to_string());
|
||||
found_any = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !found_any {
|
||||
Err(anyhow!(
|
||||
"expected answer for question '{}' was not found in ingested content",
|
||||
question.id
|
||||
))
|
||||
} else {
|
||||
Ok(matches.into_iter().collect())
|
||||
}
|
||||
}
|
||||
|
||||
fn normalize_answer_text(text: &str) -> String {
|
||||
text.chars()
|
||||
.map(|ch| {
|
||||
if ch.is_alphanumeric() || ch.is_whitespace() {
|
||||
ch.to_ascii_lowercase()
|
||||
} else {
|
||||
' '
|
||||
}
|
||||
})
|
||||
.collect::<String>()
|
||||
.split_whitespace()
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ")
|
||||
}
|
||||
|
||||
fn chunk_items<T: Clone + Serialize>(
|
||||
items: &[T],
|
||||
max_items: usize,
|
||||
max_bytes: usize,
|
||||
) -> Result<Vec<SizedBatch<T>>> {
|
||||
if items.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let mut batches = Vec::new();
|
||||
let mut current = Vec::new();
|
||||
let mut current_bytes = 0usize;
|
||||
|
||||
for item in items {
|
||||
let size = serde_json::to_vec(item)
|
||||
.map(|buf| buf.len())
|
||||
.context("serialising batch item for sizing")?;
|
||||
|
||||
let would_overflow_items = !current.is_empty() && current.len() >= max_items;
|
||||
let would_overflow_bytes = !current.is_empty() && current_bytes + size > max_bytes;
|
||||
|
||||
if would_overflow_items || would_overflow_bytes {
|
||||
batches.push(SizedBatch {
|
||||
approx_bytes: current_bytes.max(1),
|
||||
items: std::mem::take(&mut current),
|
||||
});
|
||||
current_bytes = 0;
|
||||
}
|
||||
|
||||
current_bytes += size;
|
||||
current.push(item.clone());
|
||||
}
|
||||
|
||||
if !current.is_empty() {
|
||||
batches.push(SizedBatch {
|
||||
approx_bytes: current_bytes.max(1),
|
||||
items: current,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(batches)
|
||||
}
|
||||
|
||||
async fn execute_batched_inserts<T: Clone + Serialize + 'static>(
|
||||
db: &SurrealDbClient,
|
||||
statement: impl AsRef<str>,
|
||||
prefix: &str,
|
||||
batches: &[SizedBatch<T>],
|
||||
) -> Result<()> {
|
||||
if batches.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut start = 0;
|
||||
while start < batches.len() {
|
||||
let mut group_bytes = 0usize;
|
||||
let mut group_end = start;
|
||||
let mut group_count = 0usize;
|
||||
|
||||
while group_end < batches.len() {
|
||||
let batch_bytes = batches[group_end].approx_bytes.max(1);
|
||||
if group_count > 0
|
||||
&& (group_bytes + batch_bytes > REQUEST_MAX_BYTES
|
||||
|| group_count >= MAX_BATCHES_PER_REQUEST)
|
||||
{
|
||||
break;
|
||||
}
|
||||
group_bytes += batch_bytes;
|
||||
group_end += 1;
|
||||
group_count += 1;
|
||||
}
|
||||
|
||||
let slice = &batches[start..group_end];
|
||||
let mut query = db.client.query("BEGIN TRANSACTION;");
|
||||
for (bind_index, batch) in slice.iter().enumerate() {
|
||||
let name = format!("{prefix}{bind_index}");
|
||||
query = query
|
||||
.query(format!("{} ${};", statement.as_ref(), name))
|
||||
.bind((name, batch.items.clone()));
|
||||
}
|
||||
let response = query
|
||||
.query("COMMIT TRANSACTION;")
|
||||
.await
|
||||
.context("executing batched insert transaction")?;
|
||||
if let Err(err) = response.check() {
|
||||
return Err(anyhow!(
|
||||
"batched insert failed for statement '{}': {err:?}",
|
||||
statement.as_ref()
|
||||
));
|
||||
}
|
||||
|
||||
start = group_end;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn seed_manifest_into_db(db: &SurrealDbClient, manifest: &CorpusManifest) -> Result<()> {
|
||||
let batches = build_manifest_batches(manifest).context("preparing manifest batches")?;
|
||||
|
||||
let result = async {
|
||||
execute_batched_inserts(
|
||||
db,
|
||||
format!("INSERT INTO {}", TextContent::table_name()),
|
||||
"tc",
|
||||
&batches.text_contents,
|
||||
)
|
||||
.await?;
|
||||
|
||||
execute_batched_inserts(
|
||||
db,
|
||||
format!("INSERT INTO {}", KnowledgeEntity::table_name()),
|
||||
"ke",
|
||||
&batches.entities,
|
||||
)
|
||||
.await?;
|
||||
|
||||
execute_batched_inserts(
|
||||
db,
|
||||
format!("INSERT INTO {}", TextChunk::table_name()),
|
||||
"ch",
|
||||
&batches.chunks,
|
||||
)
|
||||
.await?;
|
||||
|
||||
execute_batched_inserts(
|
||||
db,
|
||||
"INSERT RELATION INTO relates_to",
|
||||
"rel",
|
||||
&batches.relationships,
|
||||
)
|
||||
.await?;
|
||||
|
||||
execute_batched_inserts(
|
||||
db,
|
||||
format!("INSERT INTO {}", KnowledgeEntityEmbedding::table_name()),
|
||||
"kee",
|
||||
&batches.entity_embeddings,
|
||||
)
|
||||
.await?;
|
||||
|
||||
execute_batched_inserts(
|
||||
db,
|
||||
format!("INSERT INTO {}", TextChunkEmbedding::table_name()),
|
||||
"tce",
|
||||
&batches.chunk_embeddings,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
.await;
|
||||
|
||||
if result.is_err() {
|
||||
// Best-effort cleanup to avoid leaving partial manifest data behind.
|
||||
let _ = db
|
||||
.client
|
||||
.query(
|
||||
"BEGIN TRANSACTION;
|
||||
DELETE text_chunk_embedding;
|
||||
DELETE knowledge_entity_embedding;
|
||||
DELETE relates_to;
|
||||
DELETE text_chunk;
|
||||
DELETE knowledge_entity;
|
||||
DELETE text_content;
|
||||
COMMIT TRANSACTION;",
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use chrono::Utc;
|
||||
use common::storage::types::knowledge_entity::KnowledgeEntityType;
|
||||
use uuid::Uuid;
|
||||
|
||||
fn build_manifest() -> CorpusManifest {
|
||||
let user_id = "user-1".to_string();
|
||||
let source_id = "source-1".to_string();
|
||||
let now = Utc::now();
|
||||
let text_content_id = Uuid::new_v4().to_string();
|
||||
|
||||
let text_content = TextContent {
|
||||
id: text_content_id.clone(),
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
text: "Hello world".to_string(),
|
||||
file_info: None,
|
||||
url_info: None,
|
||||
context: None,
|
||||
category: "test".to_string(),
|
||||
user_id: user_id.clone(),
|
||||
};
|
||||
|
||||
let entity = KnowledgeEntity {
|
||||
id: Uuid::new_v4().to_string(),
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
source_id: source_id.clone(),
|
||||
name: "Entity".to_string(),
|
||||
description: "A test entity".to_string(),
|
||||
entity_type: KnowledgeEntityType::Document,
|
||||
metadata: None,
|
||||
user_id: user_id.clone(),
|
||||
};
|
||||
let relationship = KnowledgeRelationship::new(
|
||||
format!("knowledge_entity:{}", entity.id),
|
||||
format!("knowledge_entity:{}", entity.id),
|
||||
user_id.clone(),
|
||||
source_id.clone(),
|
||||
"related".to_string(),
|
||||
);
|
||||
|
||||
let chunk = TextChunk {
|
||||
id: Uuid::new_v4().to_string(),
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
source_id: source_id.clone(),
|
||||
chunk: "chunk text".to_string(),
|
||||
user_id: user_id.clone(),
|
||||
};
|
||||
|
||||
let paragraph_one = CorpusParagraph {
|
||||
paragraph_id: "p1".to_string(),
|
||||
title: "Paragraph 1".to_string(),
|
||||
text_content: text_content.clone(),
|
||||
entities: vec![EmbeddedKnowledgeEntity {
|
||||
entity: entity.clone(),
|
||||
embedding: vec![0.1, 0.2, 0.3],
|
||||
}],
|
||||
relationships: vec![relationship],
|
||||
chunks: vec![EmbeddedTextChunk {
|
||||
chunk: chunk.clone(),
|
||||
embedding: vec![0.3, 0.2, 0.1],
|
||||
}],
|
||||
};
|
||||
|
||||
// Duplicate content/entities should be de-duplicated by the loader.
|
||||
let paragraph_two = CorpusParagraph {
|
||||
paragraph_id: "p2".to_string(),
|
||||
title: "Paragraph 2".to_string(),
|
||||
text_content: text_content.clone(),
|
||||
entities: vec![EmbeddedKnowledgeEntity {
|
||||
entity: entity.clone(),
|
||||
embedding: vec![0.1, 0.2, 0.3],
|
||||
}],
|
||||
relationships: Vec::new(),
|
||||
chunks: vec![EmbeddedTextChunk {
|
||||
chunk: chunk.clone(),
|
||||
embedding: vec![0.3, 0.2, 0.1],
|
||||
}],
|
||||
};
|
||||
|
||||
let question = CorpusQuestion {
|
||||
question_id: "q1".to_string(),
|
||||
paragraph_id: paragraph_one.paragraph_id.clone(),
|
||||
text_content_id: text_content_id,
|
||||
question_text: "What is this?".to_string(),
|
||||
answers: vec!["Hello".to_string()],
|
||||
is_impossible: false,
|
||||
matching_chunk_ids: vec![chunk.id.clone()],
|
||||
};
|
||||
|
||||
CorpusManifest {
|
||||
version: current_manifest_version(),
|
||||
metadata: CorpusMetadata {
|
||||
dataset_id: "dataset".to_string(),
|
||||
dataset_label: "Dataset".to_string(),
|
||||
slice_id: "slice".to_string(),
|
||||
include_unanswerable: false,
|
||||
require_verified_chunks: false,
|
||||
ingestion_fingerprint: "fp".to_string(),
|
||||
embedding_backend: "test".to_string(),
|
||||
embedding_model: Some("model".to_string()),
|
||||
embedding_dimension: 3,
|
||||
converted_checksum: "checksum".to_string(),
|
||||
generated_at: now,
|
||||
paragraph_count: 2,
|
||||
question_count: 1,
|
||||
chunk_min_tokens: 1,
|
||||
chunk_max_tokens: 10,
|
||||
chunk_only: false,
|
||||
},
|
||||
paragraphs: vec![paragraph_one, paragraph_two],
|
||||
questions: vec![question],
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn window_manifest_trims_questions_and_negatives() {
|
||||
let manifest = build_manifest();
|
||||
// Add extra negatives to simulate multiplier ~4x
|
||||
let mut manifest = manifest;
|
||||
let mut extra_paragraphs = Vec::new();
|
||||
for _ in 0..8 {
|
||||
let mut p = manifest.paragraphs[0].clone();
|
||||
p.paragraph_id = Uuid::new_v4().to_string();
|
||||
p.entities.clear();
|
||||
p.relationships.clear();
|
||||
p.chunks.clear();
|
||||
extra_paragraphs.push(p);
|
||||
}
|
||||
manifest.paragraphs.extend(extra_paragraphs);
|
||||
manifest.metadata.paragraph_count = manifest.paragraphs.len();
|
||||
|
||||
let windowed = window_manifest(&manifest, 0, 1, 4.0).expect("window manifest");
|
||||
assert_eq!(windowed.questions.len(), 1);
|
||||
// Expect roughly 4x negatives (bounded by available paragraphs)
|
||||
assert!(
|
||||
windowed.paragraphs.len() <= manifest.paragraphs.len(),
|
||||
"windowed paragraphs should never exceed original"
|
||||
);
|
||||
let positive_set: std::collections::HashSet<_> = windowed
|
||||
.questions
|
||||
.iter()
|
||||
.map(|q| q.paragraph_id.as_str())
|
||||
.collect();
|
||||
let positives = windowed
|
||||
.paragraphs
|
||||
.iter()
|
||||
.filter(|p| positive_set.contains(p.paragraph_id.as_str()))
|
||||
.count();
|
||||
let negatives = windowed.paragraphs.len().saturating_sub(positives);
|
||||
assert_eq!(positives, 1);
|
||||
assert!(negatives >= 1, "should include some negatives");
|
||||
}
|
||||
}
|
||||
341
evaluations/src/datasets/beir.rs
Normal file
341
evaluations/src/datasets/beir.rs
Normal file
@@ -0,0 +1,341 @@
|
||||
use std::{
|
||||
collections::{BTreeMap, HashMap},
|
||||
fs::File,
|
||||
io::{BufRead, BufReader},
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use serde::Deserialize;
|
||||
use tracing::warn;
|
||||
|
||||
use super::{ConvertedParagraph, ConvertedQuestion, DatasetKind};
|
||||
|
||||
const ANSWER_SNIPPET_CHARS: usize = 240;
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct BeirCorpusRow {
|
||||
#[serde(rename = "_id")]
|
||||
id: String,
|
||||
#[serde(default)]
|
||||
title: Option<String>,
|
||||
#[serde(default)]
|
||||
text: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct BeirQueryRow {
|
||||
#[serde(rename = "_id")]
|
||||
id: String,
|
||||
text: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct BeirParagraph {
|
||||
title: String,
|
||||
context: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct BeirQuery {
|
||||
text: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct QrelEntry {
|
||||
doc_id: String,
|
||||
score: i32,
|
||||
}
|
||||
|
||||
pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<ConvertedParagraph>> {
|
||||
let corpus_path = raw_dir.join("corpus.jsonl");
|
||||
let queries_path = raw_dir.join("queries.jsonl");
|
||||
let qrels_path = resolve_qrels_path(raw_dir)?;
|
||||
|
||||
let corpus = load_corpus(&corpus_path)?;
|
||||
let queries = load_queries(&queries_path)?;
|
||||
let qrels = load_qrels(&qrels_path)?;
|
||||
|
||||
let mut paragraphs = Vec::with_capacity(corpus.len());
|
||||
let mut paragraph_index = HashMap::new();
|
||||
|
||||
for (doc_id, entry) in corpus.iter() {
|
||||
let paragraph_id = format!("{}-{doc_id}", dataset.source_prefix());
|
||||
let paragraph = ConvertedParagraph {
|
||||
id: paragraph_id.clone(),
|
||||
title: entry.title.clone(),
|
||||
context: entry.context.clone(),
|
||||
questions: Vec::new(),
|
||||
};
|
||||
paragraph_index.insert(doc_id.clone(), paragraphs.len());
|
||||
paragraphs.push(paragraph);
|
||||
}
|
||||
|
||||
let mut missing_queries = 0usize;
|
||||
let mut missing_docs = 0usize;
|
||||
let mut skipped_answers = 0usize;
|
||||
|
||||
for (query_id, entries) in qrels {
|
||||
let query = match queries.get(&query_id) {
|
||||
Some(query) => query,
|
||||
None => {
|
||||
missing_queries += 1;
|
||||
warn!(query_id = %query_id, "Skipping qrels entry for missing query");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let best = match select_best_doc(&entries) {
|
||||
Some(entry) => entry,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let paragraph_slot = match paragraph_index.get(&best.doc_id) {
|
||||
Some(slot) => *slot,
|
||||
None => {
|
||||
missing_docs += 1;
|
||||
warn!(
|
||||
query_id = %query_id,
|
||||
doc_id = %best.doc_id,
|
||||
"Skipping qrels entry referencing missing corpus document"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let answer = answer_snippet(¶graphs[paragraph_slot].context);
|
||||
let answers = match answer {
|
||||
Some(snippet) => vec![snippet],
|
||||
None => {
|
||||
skipped_answers += 1;
|
||||
warn!(
|
||||
query_id = %query_id,
|
||||
doc_id = %best.doc_id,
|
||||
"Skipping query because no non-empty answer snippet could be derived"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let question_id = format!("{}-{query_id}", dataset.source_prefix());
|
||||
paragraphs[paragraph_slot]
|
||||
.questions
|
||||
.push(ConvertedQuestion {
|
||||
id: question_id,
|
||||
question: query.text.clone(),
|
||||
answers,
|
||||
is_impossible: false,
|
||||
});
|
||||
}
|
||||
|
||||
if missing_queries + missing_docs + skipped_answers > 0 {
|
||||
warn!(
|
||||
missing_queries,
|
||||
missing_docs, skipped_answers, "Skipped some BEIR qrels entries during conversion"
|
||||
);
|
||||
}
|
||||
|
||||
Ok(paragraphs)
|
||||
}
|
||||
|
||||
fn resolve_qrels_path(raw_dir: &Path) -> Result<PathBuf> {
|
||||
let qrels_dir = raw_dir.join("qrels");
|
||||
let candidates = ["test.tsv", "dev.tsv", "train.tsv"];
|
||||
|
||||
for name in candidates {
|
||||
let candidate = qrels_dir.join(name);
|
||||
if candidate.exists() {
|
||||
return Ok(candidate);
|
||||
}
|
||||
}
|
||||
|
||||
Err(anyhow!(
|
||||
"No qrels file found under {}; expected one of {:?}",
|
||||
qrels_dir.display(),
|
||||
candidates
|
||||
))
|
||||
}
|
||||
|
||||
fn load_corpus(path: &Path) -> Result<BTreeMap<String, BeirParagraph>> {
|
||||
let file =
|
||||
File::open(path).with_context(|| format!("opening BEIR corpus at {}", path.display()))?;
|
||||
let reader = BufReader::new(file);
|
||||
let mut corpus = BTreeMap::new();
|
||||
|
||||
for (idx, line) in reader.lines().enumerate() {
|
||||
let raw = line
|
||||
.with_context(|| format!("reading corpus line {} from {}", idx + 1, path.display()))?;
|
||||
if raw.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
let row: BeirCorpusRow = serde_json::from_str(&raw).with_context(|| {
|
||||
format!(
|
||||
"parsing corpus JSON on line {} from {}",
|
||||
idx + 1,
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
let title = row.title.unwrap_or_else(|| row.id.clone());
|
||||
let text = row.text.unwrap_or_default();
|
||||
let context = build_context(&title, &text);
|
||||
|
||||
if context.is_empty() {
|
||||
warn!(doc_id = %row.id, "Skipping empty corpus document");
|
||||
continue;
|
||||
}
|
||||
|
||||
corpus.insert(row.id, BeirParagraph { title, context });
|
||||
}
|
||||
|
||||
Ok(corpus)
|
||||
}
|
||||
|
||||
fn load_queries(path: &Path) -> Result<BTreeMap<String, BeirQuery>> {
|
||||
let file = File::open(path)
|
||||
.with_context(|| format!("opening BEIR queries file at {}", path.display()))?;
|
||||
let reader = BufReader::new(file);
|
||||
let mut queries = BTreeMap::new();
|
||||
|
||||
for (idx, line) in reader.lines().enumerate() {
|
||||
let raw = line
|
||||
.with_context(|| format!("reading query line {} from {}", idx + 1, path.display()))?;
|
||||
if raw.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
let row: BeirQueryRow = serde_json::from_str(&raw).with_context(|| {
|
||||
format!(
|
||||
"parsing query JSON on line {} from {}",
|
||||
idx + 1,
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
queries.insert(
|
||||
row.id,
|
||||
BeirQuery {
|
||||
text: row.text.trim().to_string(),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
Ok(queries)
|
||||
}
|
||||
|
||||
fn load_qrels(path: &Path) -> Result<BTreeMap<String, Vec<QrelEntry>>> {
|
||||
let file =
|
||||
File::open(path).with_context(|| format!("opening BEIR qrels at {}", path.display()))?;
|
||||
let reader = BufReader::new(file);
|
||||
let mut qrels: BTreeMap<String, Vec<QrelEntry>> = BTreeMap::new();
|
||||
|
||||
for (idx, line) in reader.lines().enumerate() {
|
||||
let raw = line
|
||||
.with_context(|| format!("reading qrels line {} from {}", idx + 1, path.display()))?;
|
||||
let trimmed = raw.trim();
|
||||
if trimmed.is_empty() || trimmed.starts_with("query-id") {
|
||||
continue;
|
||||
}
|
||||
let mut parts = trimmed.split_whitespace();
|
||||
let query_id = parts
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("missing query id on line {}", idx + 1))?;
|
||||
let doc_id = parts
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("missing document id on line {}", idx + 1))?;
|
||||
let score_raw = parts
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("missing score on line {}", idx + 1))?;
|
||||
let score: i32 = score_raw.parse().with_context(|| {
|
||||
format!(
|
||||
"parsing qrels score '{}' on line {} from {}",
|
||||
score_raw,
|
||||
idx + 1,
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
qrels
|
||||
.entry(query_id.to_string())
|
||||
.or_default()
|
||||
.push(QrelEntry {
|
||||
doc_id: doc_id.to_string(),
|
||||
score,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(qrels)
|
||||
}
|
||||
|
||||
fn select_best_doc(entries: &[QrelEntry]) -> Option<&QrelEntry> {
|
||||
entries
|
||||
.iter()
|
||||
.max_by(|a, b| a.score.cmp(&b.score).then_with(|| b.doc_id.cmp(&a.doc_id)))
|
||||
}
|
||||
|
||||
fn answer_snippet(text: &str) -> Option<String> {
|
||||
let trimmed = text.trim();
|
||||
if trimmed.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let snippet: String = trimmed.chars().take(ANSWER_SNIPPET_CHARS).collect();
|
||||
let snippet = snippet.trim();
|
||||
if snippet.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(snippet.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
fn build_context(title: &str, text: &str) -> String {
|
||||
let title = title.trim();
|
||||
let text = text.trim();
|
||||
|
||||
match (title.is_empty(), text.is_empty()) {
|
||||
(true, true) => String::new(),
|
||||
(true, false) => text.to_string(),
|
||||
(false, true) => title.to_string(),
|
||||
(false, false) => format!("{title}\n\n{text}"),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::fs;
|
||||
use tempfile::tempdir;
|
||||
|
||||
#[test]
|
||||
fn converts_basic_beir_layout() {
|
||||
let dir = tempdir().unwrap();
|
||||
let corpus = r#"
|
||||
{"_id":"d1","title":"Doc 1","text":"Doc one has some text for testing."}
|
||||
{"_id":"d2","title":"Doc 2","text":"Second document content."}
|
||||
"#;
|
||||
let queries = r#"
|
||||
{"_id":"q1","text":"What is in doc one?"}
|
||||
"#;
|
||||
let qrels = "query-id\tcorpus-id\tscore\nq1\td1\t2\n";
|
||||
|
||||
fs::write(dir.path().join("corpus.jsonl"), corpus.trim()).unwrap();
|
||||
fs::write(dir.path().join("queries.jsonl"), queries.trim()).unwrap();
|
||||
fs::create_dir_all(dir.path().join("qrels")).unwrap();
|
||||
fs::write(dir.path().join("qrels/test.tsv"), qrels).unwrap();
|
||||
|
||||
let paragraphs = convert_beir(dir.path(), DatasetKind::Fever).unwrap();
|
||||
|
||||
assert_eq!(paragraphs.len(), 2);
|
||||
let doc_one = paragraphs
|
||||
.iter()
|
||||
.find(|p| p.id == "fever-d1")
|
||||
.expect("missing paragraph for d1");
|
||||
assert_eq!(doc_one.questions.len(), 1);
|
||||
let question = &doc_one.questions[0];
|
||||
assert_eq!(question.id, "fever-q1");
|
||||
assert!(!question.answers.is_empty());
|
||||
assert!(doc_one.context.contains(&question.answers[0]));
|
||||
|
||||
let doc_two = paragraphs
|
||||
.iter()
|
||||
.find(|p| p.id == "fever-d2")
|
||||
.expect("missing paragraph for d2");
|
||||
assert!(doc_two.questions.is_empty());
|
||||
}
|
||||
}
|
||||
623
evaluations/src/datasets/mod.rs
Normal file
623
evaluations/src/datasets/mod.rs
Normal file
@@ -0,0 +1,623 @@
|
||||
mod beir;
|
||||
mod nq;
|
||||
mod squad;
|
||||
|
||||
use std::{
|
||||
collections::{BTreeMap, HashMap},
|
||||
fs::{self},
|
||||
path::{Path, PathBuf},
|
||||
str::FromStr,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use chrono::{DateTime, TimeZone, Utc};
|
||||
use clap::ValueEnum;
|
||||
use once_cell::sync::OnceCell;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::warn;
|
||||
|
||||
const MANIFEST_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/manifest.yaml");
|
||||
static DATASET_CATALOG: OnceCell<DatasetCatalog> = OnceCell::new();
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub struct DatasetCatalog {
|
||||
datasets: BTreeMap<String, DatasetEntry>,
|
||||
slices: HashMap<String, SliceLocation>,
|
||||
default_dataset: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub struct DatasetEntry {
|
||||
pub metadata: DatasetMetadata,
|
||||
pub raw_path: PathBuf,
|
||||
pub converted_path: PathBuf,
|
||||
pub include_unanswerable: bool,
|
||||
pub slices: Vec<SliceEntry>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub struct SliceEntry {
|
||||
pub id: String,
|
||||
pub dataset_id: String,
|
||||
pub label: String,
|
||||
pub description: Option<String>,
|
||||
pub limit: Option<usize>,
|
||||
pub corpus_limit: Option<usize>,
|
||||
pub include_unanswerable: Option<bool>,
|
||||
pub seed: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
struct SliceLocation {
|
||||
dataset_id: String,
|
||||
slice_index: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct ManifestFile {
|
||||
default_dataset: Option<String>,
|
||||
datasets: Vec<ManifestDataset>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct ManifestDataset {
|
||||
id: String,
|
||||
label: String,
|
||||
category: String,
|
||||
#[serde(default)]
|
||||
entity_suffix: Option<String>,
|
||||
#[serde(default)]
|
||||
source_prefix: Option<String>,
|
||||
raw: String,
|
||||
converted: String,
|
||||
#[serde(default)]
|
||||
include_unanswerable: bool,
|
||||
#[serde(default)]
|
||||
slices: Vec<ManifestSlice>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct ManifestSlice {
|
||||
id: String,
|
||||
label: String,
|
||||
#[serde(default)]
|
||||
description: Option<String>,
|
||||
#[serde(default)]
|
||||
limit: Option<usize>,
|
||||
#[serde(default)]
|
||||
corpus_limit: Option<usize>,
|
||||
#[serde(default)]
|
||||
include_unanswerable: Option<bool>,
|
||||
#[serde(default)]
|
||||
seed: Option<u64>,
|
||||
}
|
||||
|
||||
impl DatasetCatalog {
|
||||
pub fn load() -> Result<Self> {
|
||||
let manifest_raw = fs::read_to_string(MANIFEST_PATH)
|
||||
.with_context(|| format!("reading dataset manifest at {}", MANIFEST_PATH))?;
|
||||
let manifest: ManifestFile = serde_yaml::from_str(&manifest_raw)
|
||||
.with_context(|| format!("parsing dataset manifest at {}", MANIFEST_PATH))?;
|
||||
|
||||
let root = Path::new(env!("CARGO_MANIFEST_DIR"));
|
||||
let mut datasets = BTreeMap::new();
|
||||
let mut slices = HashMap::new();
|
||||
|
||||
for dataset in manifest.datasets {
|
||||
let raw_path = resolve_path(root, &dataset.raw);
|
||||
let converted_path = resolve_path(root, &dataset.converted);
|
||||
|
||||
if !raw_path.exists() {
|
||||
bail!(
|
||||
"dataset '{}' raw file missing at {}",
|
||||
dataset.id,
|
||||
raw_path.display()
|
||||
);
|
||||
}
|
||||
if !converted_path.exists() {
|
||||
warn!(
|
||||
"dataset '{}' converted file missing at {}; the next conversion run will regenerate it",
|
||||
dataset.id,
|
||||
converted_path.display()
|
||||
);
|
||||
}
|
||||
|
||||
let metadata = DatasetMetadata {
|
||||
id: dataset.id.clone(),
|
||||
label: dataset.label.clone(),
|
||||
category: dataset.category.clone(),
|
||||
entity_suffix: dataset
|
||||
.entity_suffix
|
||||
.clone()
|
||||
.unwrap_or_else(|| dataset.label.clone()),
|
||||
source_prefix: dataset
|
||||
.source_prefix
|
||||
.clone()
|
||||
.unwrap_or_else(|| dataset.id.clone()),
|
||||
include_unanswerable: dataset.include_unanswerable,
|
||||
context_token_limit: None,
|
||||
};
|
||||
|
||||
let mut entry_slices = Vec::with_capacity(dataset.slices.len());
|
||||
|
||||
for (index, manifest_slice) in dataset.slices.into_iter().enumerate() {
|
||||
if slices.contains_key(&manifest_slice.id) {
|
||||
bail!(
|
||||
"slice '{}' defined multiple times in manifest",
|
||||
manifest_slice.id
|
||||
);
|
||||
}
|
||||
entry_slices.push(SliceEntry {
|
||||
id: manifest_slice.id.clone(),
|
||||
dataset_id: dataset.id.clone(),
|
||||
label: manifest_slice.label,
|
||||
description: manifest_slice.description,
|
||||
limit: manifest_slice.limit,
|
||||
corpus_limit: manifest_slice.corpus_limit,
|
||||
include_unanswerable: manifest_slice.include_unanswerable,
|
||||
seed: manifest_slice.seed,
|
||||
});
|
||||
slices.insert(
|
||||
manifest_slice.id,
|
||||
SliceLocation {
|
||||
dataset_id: dataset.id.clone(),
|
||||
slice_index: index,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
datasets.insert(
|
||||
metadata.id.clone(),
|
||||
DatasetEntry {
|
||||
metadata,
|
||||
raw_path,
|
||||
converted_path,
|
||||
include_unanswerable: dataset.include_unanswerable,
|
||||
slices: entry_slices,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
let default_dataset = manifest
|
||||
.default_dataset
|
||||
.or_else(|| datasets.keys().next().cloned())
|
||||
.ok_or_else(|| anyhow!("dataset manifest does not include any datasets"))?;
|
||||
|
||||
Ok(Self {
|
||||
datasets,
|
||||
slices,
|
||||
default_dataset,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn global() -> Result<&'static Self> {
|
||||
DATASET_CATALOG.get_or_try_init(Self::load)
|
||||
}
|
||||
|
||||
pub fn dataset(&self, id: &str) -> Result<&DatasetEntry> {
|
||||
self.datasets
|
||||
.get(id)
|
||||
.ok_or_else(|| anyhow!("unknown dataset '{id}' in manifest"))
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn default_dataset(&self) -> Result<&DatasetEntry> {
|
||||
self.dataset(&self.default_dataset)
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn slice(&self, slice_id: &str) -> Result<(&DatasetEntry, &SliceEntry)> {
|
||||
let location = self
|
||||
.slices
|
||||
.get(slice_id)
|
||||
.ok_or_else(|| anyhow!("unknown slice '{slice_id}' in manifest"))?;
|
||||
let dataset = self
|
||||
.datasets
|
||||
.get(&location.dataset_id)
|
||||
.ok_or_else(|| anyhow!("slice '{slice_id}' references missing dataset"))?;
|
||||
let slice = dataset
|
||||
.slices
|
||||
.get(location.slice_index)
|
||||
.ok_or_else(|| anyhow!("slice index out of bounds for '{slice_id}'"))?;
|
||||
Ok((dataset, slice))
|
||||
}
|
||||
}
|
||||
|
||||
fn resolve_path(root: &Path, value: &str) -> PathBuf {
|
||||
let path = Path::new(value);
|
||||
if path.is_absolute() {
|
||||
path.to_path_buf()
|
||||
} else {
|
||||
root.join(path)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn catalog() -> Result<&'static DatasetCatalog> {
|
||||
DatasetCatalog::global()
|
||||
}
|
||||
|
||||
fn dataset_entry_for_kind(kind: DatasetKind) -> Result<&'static DatasetEntry> {
|
||||
let catalog = catalog()?;
|
||||
catalog.dataset(kind.id())
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Default)]
|
||||
pub enum DatasetKind {
|
||||
#[default]
|
||||
SquadV2,
|
||||
NaturalQuestions,
|
||||
Beir,
|
||||
#[value(name = "fever")]
|
||||
Fever,
|
||||
#[value(name = "fiqa")]
|
||||
Fiqa,
|
||||
#[value(name = "hotpotqa", alias = "hotpot-qa")]
|
||||
HotpotQa,
|
||||
#[value(name = "nfcorpus", alias = "nf-corpus")]
|
||||
Nfcorpus,
|
||||
#[value(name = "quora")]
|
||||
Quora,
|
||||
#[value(name = "trec-covid", alias = "treccovid", alias = "trec_covid")]
|
||||
TrecCovid,
|
||||
#[value(name = "scifact")]
|
||||
Scifact,
|
||||
#[value(name = "nq-beir", alias = "natural-questions-beir")]
|
||||
NqBeir,
|
||||
}
|
||||
|
||||
impl DatasetKind {
|
||||
pub fn id(self) -> &'static str {
|
||||
match self {
|
||||
Self::SquadV2 => "squad-v2",
|
||||
Self::NaturalQuestions => "natural-questions-dev",
|
||||
Self::Beir => "beir",
|
||||
Self::Fever => "fever",
|
||||
Self::Fiqa => "fiqa",
|
||||
Self::HotpotQa => "hotpotqa",
|
||||
Self::Nfcorpus => "nfcorpus",
|
||||
Self::Quora => "quora",
|
||||
Self::TrecCovid => "trec-covid",
|
||||
Self::Scifact => "scifact",
|
||||
Self::NqBeir => "nq-beir",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn label(self) -> &'static str {
|
||||
match self {
|
||||
Self::SquadV2 => "SQuAD v2.0",
|
||||
Self::NaturalQuestions => "Natural Questions (dev)",
|
||||
Self::Beir => "BEIR mix",
|
||||
Self::Fever => "FEVER (BEIR)",
|
||||
Self::Fiqa => "FiQA-2018 (BEIR)",
|
||||
Self::HotpotQa => "HotpotQA (BEIR)",
|
||||
Self::Nfcorpus => "NFCorpus (BEIR)",
|
||||
Self::Quora => "Quora (IR)",
|
||||
Self::TrecCovid => "TREC-COVID (BEIR)",
|
||||
Self::Scifact => "SciFact (BEIR)",
|
||||
Self::NqBeir => "Natural Questions (BEIR)",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn category(self) -> &'static str {
|
||||
match self {
|
||||
Self::SquadV2 => "SQuAD v2.0",
|
||||
Self::NaturalQuestions => "Natural Questions",
|
||||
Self::Beir => "BEIR",
|
||||
Self::Fever => "FEVER",
|
||||
Self::Fiqa => "FiQA-2018",
|
||||
Self::HotpotQa => "HotpotQA",
|
||||
Self::Nfcorpus => "NFCorpus",
|
||||
Self::Quora => "Quora",
|
||||
Self::TrecCovid => "TREC-COVID",
|
||||
Self::Scifact => "SciFact",
|
||||
Self::NqBeir => "Natural Questions",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn entity_suffix(self) -> &'static str {
|
||||
match self {
|
||||
Self::SquadV2 => "SQuAD",
|
||||
Self::NaturalQuestions => "Natural Questions",
|
||||
Self::Beir => "BEIR",
|
||||
Self::Fever => "FEVER",
|
||||
Self::Fiqa => "FiQA",
|
||||
Self::HotpotQa => "HotpotQA",
|
||||
Self::Nfcorpus => "NFCorpus",
|
||||
Self::Quora => "Quora",
|
||||
Self::TrecCovid => "TREC-COVID",
|
||||
Self::Scifact => "SciFact",
|
||||
Self::NqBeir => "Natural Questions",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn source_prefix(self) -> &'static str {
|
||||
match self {
|
||||
Self::SquadV2 => "squad",
|
||||
Self::NaturalQuestions => "nq",
|
||||
Self::Beir => "beir",
|
||||
Self::Fever => "fever",
|
||||
Self::Fiqa => "fiqa",
|
||||
Self::HotpotQa => "hotpotqa",
|
||||
Self::Nfcorpus => "nfcorpus",
|
||||
Self::Quora => "quora",
|
||||
Self::TrecCovid => "trec-covid",
|
||||
Self::Scifact => "scifact",
|
||||
Self::NqBeir => "nq-beir",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn default_raw_path(self) -> PathBuf {
|
||||
dataset_entry_for_kind(self)
|
||||
.map(|entry| entry.raw_path.clone())
|
||||
.unwrap_or_else(|err| panic!("dataset manifest missing entry for {:?}: {err}", self))
|
||||
}
|
||||
|
||||
pub fn default_converted_path(self) -> PathBuf {
|
||||
dataset_entry_for_kind(self)
|
||||
.map(|entry| entry.converted_path.clone())
|
||||
.unwrap_or_else(|err| panic!("dataset manifest missing entry for {:?}: {err}", self))
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for DatasetKind {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.id())
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for DatasetKind {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s.to_ascii_lowercase().as_str() {
|
||||
"squad" | "squad-v2" | "squad_v2" => Ok(Self::SquadV2),
|
||||
"nq" | "natural-questions" | "natural_questions" | "natural-questions-dev" => {
|
||||
Ok(Self::NaturalQuestions)
|
||||
}
|
||||
"beir" => Ok(Self::Beir),
|
||||
"fever" => Ok(Self::Fever),
|
||||
"fiqa" | "fiqa-2018" => Ok(Self::Fiqa),
|
||||
"hotpotqa" | "hotpot-qa" => Ok(Self::HotpotQa),
|
||||
"nfcorpus" | "nf-corpus" => Ok(Self::Nfcorpus),
|
||||
"quora" => Ok(Self::Quora),
|
||||
"trec-covid" | "treccovid" | "trec_covid" => Ok(Self::TrecCovid),
|
||||
"scifact" => Ok(Self::Scifact),
|
||||
"nq-beir" | "natural-questions-beir" => Ok(Self::NqBeir),
|
||||
other => {
|
||||
anyhow::bail!("unknown dataset '{other}'. Expected one of: squad, natural-questions, beir, fever, fiqa, hotpotqa, nfcorpus, quora, trec-covid, scifact, nq-beir.")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub const BEIR_DATASETS: [DatasetKind; 8] = [
    DatasetKind::Fever,
    DatasetKind::Fiqa,
    DatasetKind::HotpotQa,
    DatasetKind::Nfcorpus,
    DatasetKind::Quora,
    DatasetKind::TrecCovid,
    DatasetKind::Scifact,
    DatasetKind::NqBeir,
];

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetMetadata {
    pub id: String,
    pub label: String,
    pub category: String,
    pub entity_suffix: String,
    pub source_prefix: String,
    #[serde(default)]
    pub include_unanswerable: bool,
    #[serde(default)]
    pub context_token_limit: Option<usize>,
}

impl DatasetMetadata {
    pub fn for_kind(
        kind: DatasetKind,
        include_unanswerable: bool,
        context_token_limit: Option<usize>,
    ) -> Self {
        if let Ok(entry) = dataset_entry_for_kind(kind) {
            return Self {
                id: entry.metadata.id.clone(),
                label: entry.metadata.label.clone(),
                category: entry.metadata.category.clone(),
                entity_suffix: entry.metadata.entity_suffix.clone(),
                source_prefix: entry.metadata.source_prefix.clone(),
                include_unanswerable,
                context_token_limit,
            };
        }

        Self {
            id: kind.id().to_string(),
            label: kind.label().to_string(),
            category: kind.category().to_string(),
            entity_suffix: kind.entity_suffix().to_string(),
            source_prefix: kind.source_prefix().to_string(),
            include_unanswerable,
            context_token_limit,
        }
    }
}

fn default_metadata() -> DatasetMetadata {
    DatasetMetadata::for_kind(DatasetKind::default(), false, None)
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConvertedDataset {
    pub generated_at: DateTime<Utc>,
    #[serde(default = "default_metadata")]
    pub metadata: DatasetMetadata,
    pub source: String,
    pub paragraphs: Vec<ConvertedParagraph>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConvertedParagraph {
    pub id: String,
    pub title: String,
    pub context: String,
    pub questions: Vec<ConvertedQuestion>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConvertedQuestion {
    pub id: String,
    pub question: String,
    pub answers: Vec<String>,
    pub is_impossible: bool,
}

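/// Converts a raw dataset file into the shared `ConvertedDataset` shape, dispatching on the dataset kind.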
pub fn convert(
    raw_path: &Path,
    dataset: DatasetKind,
    include_unanswerable: bool,
    context_token_limit: Option<usize>,
) -> Result<ConvertedDataset> {
    let paragraphs = match dataset {
        DatasetKind::SquadV2 => squad::convert_squad(raw_path)?,
        DatasetKind::NaturalQuestions => {
            nq::convert_nq(raw_path, include_unanswerable, context_token_limit)?
        }
        DatasetKind::Beir => convert_beir_mix(include_unanswerable, context_token_limit)?,
        DatasetKind::Fever
        | DatasetKind::Fiqa
        | DatasetKind::HotpotQa
        | DatasetKind::Nfcorpus
        | DatasetKind::Quora
        | DatasetKind::TrecCovid
        | DatasetKind::Scifact
        | DatasetKind::NqBeir => beir::convert_beir(raw_path, dataset)?,
    };

    let metadata_limit = match dataset {
        DatasetKind::NaturalQuestions => None,
        _ => context_token_limit,
    };

    let generated_at = match dataset {
        DatasetKind::Beir
        | DatasetKind::Fever
        | DatasetKind::Fiqa
        | DatasetKind::HotpotQa
        | DatasetKind::Nfcorpus
        | DatasetKind::Quora
        | DatasetKind::TrecCovid
        | DatasetKind::Scifact
        | DatasetKind::NqBeir => base_timestamp(),
        _ => Utc::now(),
    };

    let source_label = match dataset {
        DatasetKind::Beir => "beir-mix".to_string(),
        _ => raw_path.display().to_string(),
    };

    Ok(ConvertedDataset {
        generated_at,
        metadata: DatasetMetadata::for_kind(dataset, include_unanswerable, metadata_limit),
        source: source_label,
        paragraphs,
    })
}

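// Builds the combined BEIR mix by converting every subset listed in `BEIR_DATASETS`.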
fn convert_beir_mix(
    include_unanswerable: bool,
    _context_token_limit: Option<usize>,
) -> Result<Vec<ConvertedParagraph>> {
    if include_unanswerable {
        warn!("BEIR mix ignores include_unanswerable flag; all questions are answerable");
    }

    let mut paragraphs = Vec::new();
    for subset in BEIR_DATASETS {
        let entry = dataset_entry_for_kind(subset)?;
        let subset_paragraphs = beir::convert_beir(&entry.raw_path, subset)?;
        paragraphs.extend(subset_paragraphs);
    }

    Ok(paragraphs)
}

fn ensure_parent(path: &Path) -> Result<()> {
    if let Some(parent) = path.parent() {
        fs::create_dir_all(parent)
            .with_context(|| format!("creating parent directory for {}", path.display()))?;
    }
    Ok(())
}

pub fn write_converted(dataset: &ConvertedDataset, converted_path: &Path) -> Result<()> {
    ensure_parent(converted_path)?;
    let json =
        serde_json::to_string_pretty(dataset).context("serialising converted dataset to JSON")?;
    fs::write(converted_path, json)
        .with_context(|| format!("writing converted dataset to {}", converted_path.display()))
}

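/// Reads a previously written converted dataset, backfilling default metadata and a source path when those fields are missing or empty.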
pub fn read_converted(converted_path: &Path) -> Result<ConvertedDataset> {
    let raw = fs::read_to_string(converted_path)
        .with_context(|| format!("reading converted dataset at {}", converted_path.display()))?;
    let mut dataset: ConvertedDataset = serde_json::from_str(&raw)
        .with_context(|| format!("parsing converted dataset at {}", converted_path.display()))?;
    if dataset.metadata.id.trim().is_empty() {
        dataset.metadata = default_metadata();
    }
    if dataset.source.is_empty() {
        dataset.source = converted_path.display().to_string();
    }
    Ok(dataset)
}

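/// Returns the cached converted dataset when it exists and matches the requested kind and flags; otherwise converts the raw file (or re-converts when `force` is set) and rewrites the cache.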
pub fn ensure_converted(
    dataset_kind: DatasetKind,
    raw_path: &Path,
    converted_path: &Path,
    force: bool,
    include_unanswerable: bool,
    context_token_limit: Option<usize>,
) -> Result<ConvertedDataset> {
    if force || !converted_path.exists() {
        let dataset = convert(
            raw_path,
            dataset_kind,
            include_unanswerable,
            context_token_limit,
        )?;
        write_converted(&dataset, converted_path)?;
        return Ok(dataset);
    }

    match read_converted(converted_path) {
        Ok(dataset)
            if dataset.metadata.id == dataset_kind.id()
                && dataset.metadata.include_unanswerable == include_unanswerable
                && dataset.metadata.context_token_limit == context_token_limit =>
        {
            Ok(dataset)
        }
        _ => {
            let dataset = convert(
                raw_path,
                dataset_kind,
                include_unanswerable,
                context_token_limit,
            )?;
            write_converted(&dataset, converted_path)?;
            Ok(dataset)
        }
    }
}

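/// Deterministic timestamp applied to BEIR-derived datasets in place of `Utc::now()`.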
pub fn base_timestamp() -> DateTime<Utc> {
    Utc.with_ymd_and_hms(2023, 1, 1, 0, 0, 0).unwrap()
}

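// A minimal usage sketch (hypothetical, assuming the items defined above are in scope):
// parse a dataset id, then reuse or rebuild its converted form. `demo_ensure_converted`
// is an illustrative name, not something defined elsewhere in this diff.
#[allow(dead_code)]
fn demo_ensure_converted() -> Result<()> {
    let kind: DatasetKind = "scifact".parse()?;
    let raw = kind.default_raw_path();
    let converted = kind.default_converted_path();
    let dataset = ensure_converted(kind, &raw, &converted, false, false, None)?;
    println!("{} paragraphs from {}", dataset.paragraphs.len(), dataset.source);
    Ok(())
}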
234
evaluations/src/datasets/nq.rs
Normal file
@@ -0,0 +1,234 @@
use std::{
    collections::BTreeSet,
    fs::File,
    io::{BufRead, BufReader},
    path::Path,
};

use anyhow::{Context, Result};
use serde::Deserialize;
use tracing::warn;

use super::{ConvertedParagraph, ConvertedQuestion};

pub fn convert_nq(
    raw_path: &Path,
    include_unanswerable: bool,
    _context_token_limit: Option<usize>,
) -> Result<Vec<ConvertedParagraph>> {
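    // Minimal view of the NQ JSONL schema; any fields beyond these are ignored during deserialization.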
    #[allow(dead_code)]
    #[derive(Debug, Deserialize)]
    struct NqExample {
        question_text: String,
        document_title: String,
        example_id: i64,
        document_tokens: Vec<NqToken>,
        long_answer_candidates: Vec<NqLongAnswerCandidate>,
        annotations: Vec<NqAnnotation>,
    }

    #[derive(Debug, Deserialize)]
    struct NqToken {
        token: String,
        #[serde(default)]
        html_token: bool,
    }

    #[allow(dead_code)]
    #[derive(Debug, Deserialize)]
    struct NqLongAnswerCandidate {
        start_token: i32,
        end_token: i32,
    }

    #[allow(dead_code)]
    #[derive(Debug, Deserialize)]
    struct NqAnnotation {
        short_answers: Vec<NqShortAnswer>,
        #[serde(default)]
        yes_no_answer: String,
        long_answer: NqLongAnswer,
    }

    #[derive(Debug, Deserialize)]
    struct NqShortAnswer {
        start_token: i32,
        end_token: i32,
    }

    #[allow(dead_code)]
    #[derive(Debug, Deserialize)]
    struct NqLongAnswer {
        candidate_index: i32,
    }

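    // Reassembles a token span into readable text, skipping HTML tokens and re-attaching punctuation and contractions.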
    fn join_tokens(tokens: &[NqToken], start: usize, end: usize) -> String {
        let mut buffer = String::new();
        let end = end.min(tokens.len());
        for token in tokens.iter().skip(start).take(end.saturating_sub(start)) {
            if token.html_token {
                continue;
            }
            let text = token.token.trim();
            if text.is_empty() {
                continue;
            }
            let attach = matches!(
                text,
                "," | "." | "!" | "?" | ";" | ":" | ")" | "]" | "}" | "%" | "…" | "..."
            ) || text.starts_with('\'')
                || text == "n't"
                || text == "'s"
                || text == "'re"
                || text == "'ve"
                || text == "'d"
                || text == "'ll";

            if buffer.is_empty() || attach {
                buffer.push_str(text);
            } else {
                buffer.push(' ');
                buffer.push_str(text);
            }
        }

        buffer.trim().to_string()
    }

    let file = File::open(raw_path).with_context(|| {
        format!(
            "opening Natural Questions dataset at {}",
            raw_path.display()
        )
    })?;
    let reader = BufReader::new(file);

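    // The raw file is JSONL: every non-empty line is parsed as one Natural Questions example.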
    let mut paragraphs = Vec::new();
    for (line_idx, line) in reader.lines().enumerate() {
        let line = line.with_context(|| {
            format!(
                "reading Natural Questions line {} from {}",
                line_idx + 1,
                raw_path.display()
            )
        })?;
        if line.trim().is_empty() {
            continue;
        }
        let example: NqExample = serde_json::from_str(&line).with_context(|| {
            format!(
                "parsing Natural Questions JSON (line {}) at {}",
                line_idx + 1,
                raw_path.display()
            )
        })?;

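        // Gather short answers and yes/no annotations; these decide whether the example is treated as answerable.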
        let mut answer_texts: Vec<String> = Vec::new();
        let mut short_answer_texts: Vec<String> = Vec::new();
        let mut has_short_or_yesno = false;
        let mut has_short_answer = false;
        for annotation in &example.annotations {
            for short in &annotation.short_answers {
                if short.start_token < 0 || short.end_token <= short.start_token {
                    continue;
                }
                let start = short.start_token as usize;
                let end = short.end_token as usize;
                if start >= example.document_tokens.len() || end > example.document_tokens.len() {
                    continue;
                }
                let text = join_tokens(&example.document_tokens, start, end);
                if !text.is_empty() {
                    answer_texts.push(text.clone());
                    short_answer_texts.push(text);
                    has_short_or_yesno = true;
                    has_short_answer = true;
                }
            }

            match annotation
                .yes_no_answer
                .trim()
                .to_ascii_lowercase()
                .as_str()
            {
                "yes" => {
                    answer_texts.push("yes".to_string());
                    has_short_or_yesno = true;
                }
                "no" => {
                    answer_texts.push("no".to_string());
                    has_short_or_yesno = true;
                }
                _ => {}
            }
        }

        let mut answers = dedupe_strings(answer_texts);
        let is_unanswerable = !has_short_or_yesno || answers.is_empty();
        if is_unanswerable {
            if !include_unanswerable {
                continue;
            }
            answers.clear();
        }

        let paragraph_id = format!("nq-{}", example.example_id);
        let question_id = format!("nq-{}", example.example_id);

        let context = join_tokens(&example.document_tokens, 0, example.document_tokens.len());
        if context.is_empty() {
            continue;
        }

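        // Skip examples whose short answers cannot be found (case-insensitively) in the reassembled context.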
        if has_short_answer && !short_answer_texts.is_empty() {
            let normalized_context = context.to_ascii_lowercase();
            let missing_answer = short_answer_texts.iter().any(|answer| {
                let needle = answer.trim().to_ascii_lowercase();
                !needle.is_empty() && !normalized_context.contains(&needle)
            });
            if missing_answer {
                warn!(
                    question_id = %question_id,
                    "Skipping Natural Questions example because answers were not found in the assembled context"
                );
                continue;
            }
        }

        if !include_unanswerable && (!has_short_answer || short_answer_texts.is_empty()) {
            // yes/no-only questions are excluded by default unless --llm-mode is used
            continue;
        }

        let question = ConvertedQuestion {
            id: question_id,
            question: example.question_text.trim().to_string(),
            answers,
            is_impossible: is_unanswerable,
        };

        paragraphs.push(ConvertedParagraph {
            id: paragraph_id,
            title: example.document_title.trim().to_string(),
            context,
            questions: vec![question],
        });
    }

    Ok(paragraphs)
}

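// Trims, drops empty strings, and de-duplicates answers, returning them in a stable sorted order.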
fn dedupe_strings<I>(values: I) -> Vec<String>
where
    I: IntoIterator<Item = String>,
{
    let mut set = BTreeSet::new();
    for value in values {
        let trimmed = value.trim();
        if !trimmed.is_empty() {
            set.insert(trimmed.to_string());
        }
    }
    set.into_iter().collect()
}
Some files were not shown because too many files have changed in this diff.