mirror of
https://github.com/perstarkse/minne.git
synced 2026-04-23 17:28:34 +02:00
feat: text splitting and storage
This commit is contained in:
131
Cargo.lock
generated
131
Cargo.lock
generated
@@ -261,7 +261,7 @@ checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
"synstructure",
|
"synstructure",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -273,7 +273,7 @@ checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -391,7 +391,7 @@ dependencies = [
|
|||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"strum",
|
"strum",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -535,7 +535,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -552,7 +552,7 @@ checksum = "a27b8a3a6e1a44fa4c8baf1f653e4172e81486d4941f2237e20dc2d0cf4ddff1"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -572,6 +572,18 @@ version = "1.1.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
|
checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "auto_enums"
|
||||||
|
version = "0.8.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "459b77b7e855f875fd15f101064825cd79eb83185a961d66e6298560126facfb"
|
||||||
|
dependencies = [
|
||||||
|
"derive_utils",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 2.0.87",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "autocfg"
|
name = "autocfg"
|
||||||
version = "1.3.0"
|
version = "1.3.0"
|
||||||
@@ -643,7 +655,7 @@ checksum = "57d123550fa8d071b7255cb0cc04dc302baa6c8c4a79f55701552684d8399bce"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -675,7 +687,7 @@ dependencies = [
|
|||||||
"heck 0.5.0",
|
"heck 0.5.0",
|
||||||
"proc-macro-error",
|
"proc-macro-error",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
"ubyte",
|
"ubyte",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -870,7 +882,7 @@ dependencies = [
|
|||||||
"proc-macro-crate",
|
"proc-macro-crate",
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
"syn_derive",
|
"syn_derive",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -1178,7 +1190,7 @@ dependencies = [
|
|||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"strsim",
|
"strsim",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -1189,7 +1201,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"darling_core",
|
"darling_core",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -1246,7 +1258,7 @@ checksum = "8034092389675178f570469e6c3b0465d3d30b4505c294a6550db47f3c17ad18"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -1277,7 +1289,7 @@ dependencies = [
|
|||||||
"darling",
|
"darling",
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -1287,7 +1299,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "4abae7035bf79b9877b779505d8cf3749285b80c43941eda66604841889451dc"
|
checksum = "4abae7035bf79b9877b779505d8cf3749285b80c43941eda66604841889451dc"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"derive_builder_core",
|
"derive_builder_core",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "derive_utils"
|
||||||
|
version = "0.14.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "65f152f4b8559c4da5d574bafc7af85454d706b4c5fe8b530d508cacbb6807ea"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -1345,7 +1368,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -1643,7 +1666,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -1864,7 +1887,7 @@ dependencies = [
|
|||||||
"markup5ever",
|
"markup5ever",
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -2350,7 +2373,7 @@ checksum = "49e7bc1560b95a3c4a25d03de42fe76ca718ab92d1a22a55b9b4cf67b3ae635c"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -2419,7 +2442,7 @@ dependencies = [
|
|||||||
"cfg-if",
|
"cfg-if",
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -2616,9 +2639,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "once_cell"
|
name = "once_cell"
|
||||||
version = "1.19.0"
|
version = "1.20.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
|
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "openssl-probe"
|
name = "openssl-probe"
|
||||||
@@ -2824,7 +2847,7 @@ dependencies = [
|
|||||||
"phf_shared 0.11.2",
|
"phf_shared 0.11.2",
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
"unicase",
|
"unicase",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -2870,7 +2893,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -3278,7 +3301,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -3419,7 +3442,7 @@ checksum = "5f0ec466e5d8dca9965eb6871879677bef5590cf7525ad96cae14376efb75073"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -3828,7 +3851,7 @@ checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -3893,7 +3916,7 @@ dependencies = [
|
|||||||
"darling",
|
"darling",
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -4157,7 +4180,7 @@ dependencies = [
|
|||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"rustversion",
|
"rustversion",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -4312,9 +4335,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "syn"
|
name = "syn"
|
||||||
version = "2.0.77"
|
version = "2.0.87"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed"
|
checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
@@ -4330,7 +4353,7 @@ dependencies = [
|
|||||||
"proc-macro-error",
|
"proc-macro-error",
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -4356,7 +4379,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -4419,23 +4442,40 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
|
checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "thiserror"
|
name = "text-splitter"
|
||||||
version = "1.0.63"
|
version = "0.18.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724"
|
checksum = "189450e9eaff1a8037cca4d60ca62c134a7e601187430bdd487c86e25e8d6641"
|
||||||
|
dependencies = [
|
||||||
|
"ahash 0.8.11",
|
||||||
|
"auto_enums",
|
||||||
|
"either",
|
||||||
|
"itertools 0.13.0",
|
||||||
|
"once_cell",
|
||||||
|
"regex",
|
||||||
|
"strum",
|
||||||
|
"thiserror",
|
||||||
|
"unicode-segmentation",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "thiserror"
|
||||||
|
version = "1.0.69"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"thiserror-impl",
|
"thiserror-impl",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "thiserror-impl"
|
name = "thiserror-impl"
|
||||||
version = "1.0.63"
|
version = "1.0.69"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261"
|
checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -4529,7 +4569,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -4649,7 +4689,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -4805,6 +4845,12 @@ dependencies = [
|
|||||||
"unicode-script",
|
"unicode-script",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-segmentation"
|
||||||
|
version = "1.12.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-width"
|
name = "unicode-width"
|
||||||
version = "0.1.14"
|
version = "0.1.14"
|
||||||
@@ -4923,7 +4969,7 @@ dependencies = [
|
|||||||
"once_cell",
|
"once_cell",
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
"wasm-bindgen-shared",
|
"wasm-bindgen-shared",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -4957,7 +5003,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
"wasm-bindgen-backend",
|
"wasm-bindgen-backend",
|
||||||
"wasm-bindgen-shared",
|
"wasm-bindgen-shared",
|
||||||
]
|
]
|
||||||
@@ -5324,7 +5370,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.77",
|
"syn 2.0.87",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -5351,6 +5397,7 @@ dependencies = [
|
|||||||
"sha2",
|
"sha2",
|
||||||
"surrealdb",
|
"surrealdb",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
|
"text-splitter",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tracing",
|
"tracing",
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ serde_json = "1.0.128"
|
|||||||
sha2 = "0.10.8"
|
sha2 = "0.10.8"
|
||||||
surrealdb = "2.0.4"
|
surrealdb = "2.0.4"
|
||||||
tempfile = "3.12.0"
|
tempfile = "3.12.0"
|
||||||
|
text-splitter = "0.18.1"
|
||||||
thiserror = "1.0.63"
|
thiserror = "1.0.63"
|
||||||
tokio = { version = "1.40.0", features = ["full"] }
|
tokio = { version = "1.40.0", features = ["full"] }
|
||||||
tracing = "0.1.40"
|
tracing = "0.1.40"
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ pub struct KnowledgeEntity {
|
|||||||
pub embedding: Option<Vec<f32>>,
|
pub embedding: Option<Vec<f32>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn thing_to_string<'de, D>(deserializer: D) -> Result<String, D::Error>
|
pub fn thing_to_string<'de, D>(deserializer: D) -> Result<String, D::Error>
|
||||||
where
|
where
|
||||||
D: Deserializer<'de>,
|
D: Deserializer<'de>,
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ impl IngressObject {
|
|||||||
let text = Self::fetch_text_from_url(url).await?;
|
let text = Self::fetch_text_from_url(url).await?;
|
||||||
let id = Uuid::new_v4();
|
let id = Uuid::new_v4();
|
||||||
Ok(TextContent {
|
Ok(TextContent {
|
||||||
id,
|
id: id.to_string(),
|
||||||
text,
|
text,
|
||||||
instructions: instructions.clone(),
|
instructions: instructions.clone(),
|
||||||
category: category.clone(),
|
category: category.clone(),
|
||||||
@@ -48,7 +48,7 @@ impl IngressObject {
|
|||||||
IngressObject::Text { text, instructions, category } => {
|
IngressObject::Text { text, instructions, category } => {
|
||||||
let id = Uuid::new_v4();
|
let id = Uuid::new_v4();
|
||||||
Ok(TextContent {
|
Ok(TextContent {
|
||||||
id,
|
id: id.to_string(),
|
||||||
text: text.clone(),
|
text: text.clone(),
|
||||||
instructions: instructions.clone(),
|
instructions: instructions.clone(),
|
||||||
category: category.clone(),
|
category: category.clone(),
|
||||||
@@ -59,7 +59,7 @@ impl IngressObject {
|
|||||||
let id = Uuid::new_v4();
|
let id = Uuid::new_v4();
|
||||||
let text = Self::extract_text_from_file(file_info).await?;
|
let text = Self::extract_text_from_file(file_info).await?;
|
||||||
Ok(TextContent {
|
Ok(TextContent {
|
||||||
id,
|
id: id.to_string(),
|
||||||
text,
|
text,
|
||||||
instructions: instructions.clone(),
|
instructions: instructions.clone(),
|
||||||
category: category.clone(),
|
category: category.clone(),
|
||||||
|
|||||||
@@ -1,17 +1,32 @@
|
|||||||
|
use crate::{
|
||||||
|
models::file_info::FileInfo,
|
||||||
|
surrealdb::{SurrealDbClient, SurrealError},
|
||||||
|
utils::llm::{create_json_ld, generate_embedding},
|
||||||
|
};
|
||||||
use async_openai::error::OpenAIError;
|
use async_openai::error::OpenAIError;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use surrealdb::{engine::remote::ws::Client, Surreal};
|
use surrealdb::{engine::remote::ws::Client, Surreal};
|
||||||
|
use text_splitter::TextSplitter;
|
||||||
|
use thiserror::Error;
|
||||||
use tracing::{debug, info};
|
use tracing::{debug, info};
|
||||||
use uuid::Uuid;
|
use uuid::Uuid;
|
||||||
use crate::{models::file_info::FileInfo, surrealdb::{SurrealDbClient, SurrealError}, utils::llm::create_json_ld};
|
|
||||||
use thiserror::Error;
|
|
||||||
|
|
||||||
use super::graph_entities::{KnowledgeEntity, KnowledgeRelationship};
|
use super::graph_entities::{thing_to_string, KnowledgeEntity, KnowledgeRelationship};
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
|
struct TextChunk {
|
||||||
|
#[serde(deserialize_with = "thing_to_string")]
|
||||||
|
id: String,
|
||||||
|
source_id: String,
|
||||||
|
chunk: String,
|
||||||
|
embedding: Vec<f32>,
|
||||||
|
}
|
||||||
|
|
||||||
/// Represents a single piece of text content extracted from various sources.
|
/// Represents a single piece of text content extracted from various sources.
|
||||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
pub struct TextContent {
|
pub struct TextContent {
|
||||||
pub id: Uuid,
|
#[serde(deserialize_with = "thing_to_string")]
|
||||||
|
pub id: String,
|
||||||
pub text: String,
|
pub text: String,
|
||||||
pub file_info: Option<FileInfo>,
|
pub file_info: Option<FileInfo>,
|
||||||
pub instructions: String,
|
pub instructions: String,
|
||||||
@@ -26,13 +41,13 @@ pub enum ProcessingError {
|
|||||||
|
|
||||||
#[error("SurrealDB error: {0}")]
|
#[error("SurrealDB error: {0}")]
|
||||||
SurrealError(#[from] SurrealError),
|
SurrealError(#[from] SurrealError),
|
||||||
|
|
||||||
#[error("SurrealDb error: {0}")]
|
#[error("SurrealDb error: {0}")]
|
||||||
SurrealDbError(#[from] surrealdb::Error),
|
SurrealDbError(#[from] surrealdb::Error),
|
||||||
|
|
||||||
#[error("Graph DB storage error: {0}")]
|
#[error("Graph DB storage error: {0}")]
|
||||||
GraphDBError(String),
|
GraphDBError(String),
|
||||||
|
|
||||||
#[error("Vector DB storage error: {0}")]
|
#[error("Vector DB storage error: {0}")]
|
||||||
VectorDBError(String),
|
VectorDBError(String),
|
||||||
|
|
||||||
@@ -43,39 +58,106 @@ pub enum ProcessingError {
|
|||||||
OpenAIerror(#[from] OpenAIError),
|
OpenAIerror(#[from] OpenAIError),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn vector_comparison<T>(
|
||||||
|
take: u8,
|
||||||
|
input_text: String,
|
||||||
|
db_client: &Surreal<Client>,
|
||||||
|
table: String,
|
||||||
|
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||||
|
) -> Result<Vec<T>, ProcessingError>
|
||||||
|
where
|
||||||
|
T: for<'de> serde::Deserialize<'de>, // Add this trait bound for deserialization
|
||||||
|
{
|
||||||
|
let input_embedding = generate_embedding(&openai_client, input_text).await?;
|
||||||
|
|
||||||
|
// Construct the query
|
||||||
|
let closest_query = format!("SELECT *, vector::distance::knn() AS distance FROM {} WHERE embedding <|{},40|> {:?} ORDER BY distance",table, take, input_embedding);
|
||||||
|
|
||||||
|
// Perform query and deserialize to struct
|
||||||
|
let closest_entities: Vec<T> = db_client.query(closest_query).await?.take(0)?;
|
||||||
|
|
||||||
|
Ok(closest_entities)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_related_nodes(
|
||||||
|
id: String,
|
||||||
|
db_client: &Surreal<Client>,
|
||||||
|
) -> Result<Vec<KnowledgeEntity>, ProcessingError> {
|
||||||
|
let query = format!("SELECT -> knowledge_relationship -> knowledge_entity as related_nodes FROM knowledge_entity WHERE source_id = `{}`", id);
|
||||||
|
|
||||||
|
// let query = format!("SELECT * FROM knowledge_entity WHERE in OR out {}", id);
|
||||||
|
let related_nodes: Vec<KnowledgeEntity> = db_client.query(query).await?.take(0)?;
|
||||||
|
|
||||||
|
Ok(related_nodes)
|
||||||
|
}
|
||||||
|
|
||||||
impl TextContent {
|
impl TextContent {
|
||||||
/// Processes the `TextContent` by sending it to an LLM, storing in a graph DB, and vector DB.
|
/// Processes the `TextContent` by sending it to an LLM, storing in a graph DB, and vector DB.
|
||||||
pub async fn process(&self) -> Result<(), ProcessingError> {
|
pub async fn process(&self) -> Result<(), ProcessingError> {
|
||||||
// Store TextContent
|
// Store TextContent
|
||||||
let db_client = SurrealDbClient::new().await?;
|
let db_client = SurrealDbClient::new().await?;
|
||||||
|
let openai_client = async_openai::Client::new();
|
||||||
|
|
||||||
// let deleted: Vec<KnowledgeEntity> = db_client.delete("knowledge_entity").await?;
|
self.store_text_content(&db_client).await?;
|
||||||
|
|
||||||
|
let closest_text_content: Vec<TextChunk> = vector_comparison(
|
||||||
|
4,
|
||||||
|
self.text.clone(),
|
||||||
|
&db_client,
|
||||||
|
"text_chunk".to_string(),
|
||||||
|
&openai_client,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
for node in closest_text_content {
|
||||||
|
info!("{}-{}", node.id, node.source_id);
|
||||||
|
let related_nodes = get_related_nodes(node.source_id, &db_client).await?;
|
||||||
|
info!("{:?}", related_nodes);
|
||||||
|
}
|
||||||
|
|
||||||
|
panic!("STOPPING");
|
||||||
|
// let deleted: Vec<TextChunk> = db_client.delete("text_chunk").await?;
|
||||||
// info! {"{:?} KnowledgeEntities deleted", deleted.len()};
|
// info! {"{:?} KnowledgeEntities deleted", deleted.len()};
|
||||||
|
|
||||||
// let relationships_deleted: Vec<KnowledgeRelationship> =
|
// let relationships_deleted: Vec<KnowledgeRelationship> =
|
||||||
// db_client.delete("knowledge_relationship").await?;
|
// db_client.delete("knowledge_relationship").await?;
|
||||||
// info!("{:?} Relationships deleted", relationships_deleted.len());
|
// info!("{:?} Relationships deleted", relationships_deleted.len());
|
||||||
|
|
||||||
// panic!("STOP");
|
// panic!("STOP");
|
||||||
|
|
||||||
// db_client.query("REMOVE INDEX embeddings ON knowledge_entity").await?;
|
// db_client.query("REMOVE INDEX embeddings ON knowledge_entity").await?;
|
||||||
// db_client.query("DEFINE INDEX embeddings ON knowledge_entity FIELDS embedding HNSW DIMENSION 1536").await?;
|
// db_client
|
||||||
db_client.query("REBUILD INDEX IF EXISTS embeddings ON knowledge_entity").await?;
|
// .query("DEFINE INDEX idx_embedding ON text_chunk FIELDS embedding HNSW DIMENSION 1536")
|
||||||
|
// .await?;
|
||||||
|
db_client
|
||||||
|
.query("REBUILD INDEX IF EXISTS idx_embedding ON text_chunk")
|
||||||
|
.await?;
|
||||||
|
db_client
|
||||||
|
.query("REBUILD INDEX IF EXISTS embeddings ON knowledge_entity")
|
||||||
|
.await?;
|
||||||
|
|
||||||
// Step 1: Send to LLM for analysis
|
// Step 1: Send to LLM for analysis
|
||||||
let analysis = create_json_ld(&self.category, &self.instructions, &self.text, &db_client).await?;
|
let analysis = create_json_ld(
|
||||||
|
&self.category,
|
||||||
|
&self.instructions,
|
||||||
|
&self.text,
|
||||||
|
&db_client,
|
||||||
|
&openai_client,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
// info!("{:#?}", &analysis);
|
// info!("{:#?}", &analysis);
|
||||||
|
|
||||||
// Step 2: Convert LLM analysis to database entities
|
// Step 2: Convert LLM analysis to database entities
|
||||||
let (entities, relationships) = analysis.to_database_entities(&self.id).await?;
|
let (entities, relationships) = analysis
|
||||||
|
.to_database_entities(&self.id, &openai_client)
|
||||||
|
.await?;
|
||||||
|
|
||||||
// Step 3: Store in database
|
// Step 3: Store in database
|
||||||
self.store_in_graph_db(entities, relationships, &db_client).await?;
|
self.store_in_graph_db(entities, relationships, &db_client)
|
||||||
|
.await?;
|
||||||
|
|
||||||
// Step 4: Split text and store in Vector DB
|
// Step 4: Split text and store in Vector DB
|
||||||
// self.store_in_vector_db().await?;
|
self.store_in_vector_db(&db_client, &openai_client).await?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -87,14 +169,17 @@ impl TextContent {
|
|||||||
db_client: &Surreal<Client>,
|
db_client: &Surreal<Client>,
|
||||||
) -> Result<(), ProcessingError> {
|
) -> Result<(), ProcessingError> {
|
||||||
for entity in &entities {
|
for entity in &entities {
|
||||||
info!("{:?}, {:?}, {:?}", &entity.id, &entity.name, &entity.description);
|
info!(
|
||||||
|
"{:?}, {:?}, {:?}",
|
||||||
|
&entity.id, &entity.name, &entity.description
|
||||||
|
);
|
||||||
|
|
||||||
let _created: Option<KnowledgeEntity> = db_client
|
let _created: Option<KnowledgeEntity> = db_client
|
||||||
.create(("knowledge_entity", &entity.id.to_string()))
|
.create(("knowledge_entity", &entity.id.to_string()))
|
||||||
.content(entity.clone())
|
.content(entity.clone())
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
debug!("{:?}",_created);
|
debug!("{:?}", _created);
|
||||||
}
|
}
|
||||||
|
|
||||||
for relationship in &relationships {
|
for relationship in &relationships {
|
||||||
@@ -105,13 +190,13 @@ impl TextContent {
|
|||||||
.content(relationship.clone())
|
.content(relationship.clone())
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
debug!("{:?}",_created);
|
debug!("{:?}", _created);
|
||||||
}
|
}
|
||||||
|
|
||||||
// for relationship in &relationships {
|
// for relationship in &relationships {
|
||||||
// let in_entity: Option<KnowledgeEntity> = db_client.select(("knowledge_entity",relationship.in_.to_string())).await?;
|
// let in_entity: Option<KnowledgeEntity> = db_client.select(("knowledge_entity",relationship.in_.to_string())).await?;
|
||||||
// let out_entity: Option<KnowledgeEntity> = db_client.select(("knowledge_entity", relationship.out.to_string())).await?;
|
// let out_entity: Option<KnowledgeEntity> = db_client.select(("knowledge_entity", relationship.out.to_string())).await?;
|
||||||
|
|
||||||
// if let (Some(in_), Some(out)) = (in_entity, out_entity) {
|
// if let (Some(in_), Some(out)) = (in_entity, out_entity) {
|
||||||
// info!("{} - {} is {} to {} - {}", in_.id, in_.name, relationship.relationship_type, out.id, out.name);
|
// info!("{} - {} is {} to {} - {}", in_.id, in_.name, relationship.relationship_type, out.id, out.name);
|
||||||
// }
|
// }
|
||||||
@@ -120,24 +205,59 @@ impl TextContent {
|
|||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
|
|
||||||
info!("Inserted to database: {:?} entities, {:?} relationships", entities.len(), relationships.len());
|
info!(
|
||||||
|
"Inserted to database: {:?} entities, {:?} relationships",
|
||||||
|
entities.len(),
|
||||||
|
relationships.len()
|
||||||
|
);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Splits text and stores it in a vector database.
|
/// Splits text and stores it in a vector database.
|
||||||
#[allow(dead_code)]
|
#[allow(dead_code)]
|
||||||
async fn store_in_vector_db(&self) -> Result<(), ProcessingError> {
|
async fn store_in_vector_db(
|
||||||
// TODO: Implement text splitting and vector storage logic.
|
&self,
|
||||||
// Example:
|
db_client: &Surreal<Client>,
|
||||||
/*
|
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||||
let chunks = text_splitter::split(&self.text);
|
) -> Result<(), ProcessingError> {
|
||||||
let vector_db = VectorDB::new("http://vector-db:5000");
|
let max_characters = 500..2000;
|
||||||
|
let splitter = TextSplitter::new(max_characters);
|
||||||
|
|
||||||
|
let chunks = splitter.chunks(self.text.as_str());
|
||||||
|
|
||||||
for chunk in chunks {
|
for chunk in chunks {
|
||||||
vector_db.insert(chunk).await.map_err(|e| ProcessingError::VectorDBError(e.to_string()))?;
|
info!("Chunk: {}", chunk);
|
||||||
|
let embedding = generate_embedding(&openai_client, chunk.to_string()).await?;
|
||||||
|
let text_chunk = TextChunk {
|
||||||
|
id: Uuid::new_v4().to_string(),
|
||||||
|
source_id: self.id.clone(),
|
||||||
|
chunk: chunk.to_string(),
|
||||||
|
embedding,
|
||||||
|
};
|
||||||
|
|
||||||
|
info!("{:?}", text_chunk);
|
||||||
|
|
||||||
|
let _created: Option<TextChunk> = db_client
|
||||||
|
.create(("text_chunk", text_chunk.id.clone()))
|
||||||
|
.content(text_chunk)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
debug!("{:?}", _created);
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
unimplemented!()
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stores text content in database
|
||||||
|
async fn store_text_content(&self, db_client: &Surreal<Client>) -> Result<(), ProcessingError> {
|
||||||
|
let _created: Option<TextContent> = db_client
|
||||||
|
.create(("text_content", self.id.clone()))
|
||||||
|
.content(self.clone())
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
debug!("{:?}", _created);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ pub struct LLMGraphAnalysisResult {
|
|||||||
pub relationships: Vec<LLMRelationship>,
|
pub relationships: Vec<LLMRelationship>,
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn generate_embedding(
|
pub async fn generate_embedding(
|
||||||
client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||||
input: String,
|
input: String,
|
||||||
) -> Result<Vec<f32>, ProcessingError> {
|
) -> Result<Vec<f32>, ProcessingError> {
|
||||||
@@ -73,13 +73,15 @@ impl LLMGraphAnalysisResult {
|
|||||||
/// # Arguments
|
/// # Arguments
|
||||||
///
|
///
|
||||||
/// * `source_id` - A UUID representing the source identifier.
|
/// * `source_id` - A UUID representing the source identifier.
|
||||||
|
/// * `openai_client` - OpenAI client for LLM calls.
|
||||||
///
|
///
|
||||||
/// # Returns
|
/// # Returns
|
||||||
///
|
///
|
||||||
/// * `Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), ProcessingError>` - A tuple containing vectors of `KnowledgeEntity` and `KnowledgeRelationship`.
|
/// * `Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), ProcessingError>` - A tuple containing vectors of `KnowledgeEntity` and `KnowledgeRelationship`.
|
||||||
pub async fn to_database_entities(
|
pub async fn to_database_entities(
|
||||||
&self,
|
&self,
|
||||||
source_id: &Uuid,
|
source_id: &String,
|
||||||
|
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||||
) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), ProcessingError> {
|
) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), ProcessingError> {
|
||||||
let mut mapper = GraphMapper::new();
|
let mut mapper = GraphMapper::new();
|
||||||
|
|
||||||
@@ -88,7 +90,6 @@ impl LLMGraphAnalysisResult {
|
|||||||
mapper.assign_id(&llm_entity.key);
|
mapper.assign_id(&llm_entity.key);
|
||||||
}
|
}
|
||||||
|
|
||||||
let openai_client = async_openai::Client::new();
|
|
||||||
|
|
||||||
let mut entities = vec![];
|
let mut entities = vec![];
|
||||||
|
|
||||||
@@ -154,15 +155,13 @@ pub async fn create_json_ld(
|
|||||||
instructions: &str,
|
instructions: &str,
|
||||||
text: &str,
|
text: &str,
|
||||||
db_client: &Surreal<Client>,
|
db_client: &Surreal<Client>,
|
||||||
|
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||||
) -> Result<LLMGraphAnalysisResult, ProcessingError> {
|
) -> Result<LLMGraphAnalysisResult, ProcessingError> {
|
||||||
// Initialize llm client
|
|
||||||
let client = async_openai::Client::new();
|
|
||||||
|
|
||||||
// Format the input for more cohesive comparison
|
// Format the input for more cohesive comparison
|
||||||
let input_text = format!("content: {:?}, category: {:?}, user_instructions: {:?}", text, category, instructions);
|
let input_text = format!("content: {:?}, category: {:?}, user_instructions: {:?}", text, category, instructions);
|
||||||
|
|
||||||
// Generate embedding of the input
|
// Generate embedding of the input
|
||||||
let input_embedding = generate_embedding(&client, input_text).await?;
|
let input_embedding = generate_embedding(&openai_client, input_text).await?;
|
||||||
|
|
||||||
let number_of_entities_to_get = 10;
|
let number_of_entities_to_get = 10;
|
||||||
|
|
||||||
@@ -276,6 +275,7 @@ pub async fn create_json_ld(
|
|||||||
6. You will be presented with a few existing KnowledgeEntities that are similar to the current ones. They will have an existing UUID. When creating relationships to these entities, use their UUID.
|
6. You will be presented with a few existing KnowledgeEntities that are similar to the current ones. They will have an existing UUID. When creating relationships to these entities, use their UUID.
|
||||||
7. Only create relationships between existing KnowledgeEntities.
|
7. Only create relationships between existing KnowledgeEntities.
|
||||||
8. Entities that exist already in the database should NOT be created again. If there is only a minor overlap, skip creating a new entity.
|
8. Entities that exist already in the database should NOT be created again. If there is only a minor overlap, skip creating a new entity.
|
||||||
|
9. A new relationship MUST include a newly created KnowledgeEntity.
|
||||||
"#;
|
"#;
|
||||||
|
|
||||||
let user_message = format!(
|
let user_message = format!(
|
||||||
@@ -297,7 +297,7 @@ pub async fn create_json_ld(
|
|||||||
.map_err(|e| ProcessingError::LLMError(e.to_string()))?;
|
.map_err(|e| ProcessingError::LLMError(e.to_string()))?;
|
||||||
|
|
||||||
// Send the request to OpenAI
|
// Send the request to OpenAI
|
||||||
let response = client
|
let response = openai_client
|
||||||
.chat()
|
.chat()
|
||||||
.create(request)
|
.create(request)
|
||||||
.await
|
.await
|
||||||
|
|||||||
Reference in New Issue
Block a user