"""
chroma_store.py - ChromaDB vector store for semantic search.

Extends the SQLite store with vector similarity.
"""

import json
import logging
from pathlib import Path
from typing import List, Optional, Dict, Any
from uuid import UUID

import chromadb
from chromadb.config import Settings

from .engram import Engram
from .embedder import encode


class ChromaStore:
    """ChromaDB-backed vector store for engrams.

    Each engram is stored as an embedding vector plus a flat metadata
    record in the ``engrams`` collection, which is created with cosine
    distance (``hnsw:space = cosine``).
    """

    # Maximum number of characters of ``engram.content`` copied into the
    # metadata record.  Shared by ``add`` and ``update`` so that an update
    # never silently shortens the content stored by a previous add.
    CONTENT_PREVIEW_CHARS = 1000

    _logger = logging.getLogger(__name__)

    def __init__(self, path: str = "data/chroma"):
        """Open (or create) the persistent collection stored under *path*.

        The directory is created if it does not exist yet.
        """
        self.path = Path(path)
        self.path.mkdir(parents=True, exist_ok=True)
        self.client = chromadb.PersistentClient(path=str(self.path))
        self.collection = self.client.get_or_create_collection(
            name="engrams",
            metadata={"hnsw:space": "cosine"},
        )

    def _build_metadata(self, engram: Engram) -> Dict[str, Any]:
        """Return a flat metadata dict containing only str/int/float values.

        Only an explicit allow-list of fields is copied from
        ``engram.metadata``; missing fields fall back to defaults.  Tags
        are flattened into one comma-separated string, and the content is
        truncated to ``CONTENT_PREVIEW_CHARS`` characters.
        """
        m = engram.metadata
        tags = m.get("tags", [])
        if isinstance(tags, (list, tuple)):
            tag_text = ",".join(str(t) for t in tags)
        elif isinstance(tags, (set, frozenset)):
            # Sets have no stable order; sort so the stored value is deterministic.
            tag_text = ",".join(sorted(str(t) for t in tags))
        else:
            tag_text = str(tags)

        return {
            "source": str(m.get("source", "agent")),
            "confidence": float(m.get("confidence", 0.5)),
            "grounding": int(m.get("grounding", 1)),
            "tags": tag_text,
            "created": str(m.get("created", "")),
            "modified": str(m.get("modified", "")),
            "access_count": int(m.get("access_count", 0)),
            "correctness": (
                "confirmed" if engram.correctness.confirmed else "unconfirmed"
            ),
            "content": str(engram.content)[: self.CONTENT_PREVIEW_CHARS],
        }

    def _resolve_embedding(self, engram: Engram, embedding: Optional[List[float]]):
        """Pick the embedding to store for *engram*.

        Preference order: the explicit *embedding* argument, then
        ``engram.embedding``, then a fresh ``encode(engram.content)``.
        Candidates are tested with ``is not None`` / ``len`` rather than
        truthiness, because array-like embeddings raise on ``bool()``.
        Returns whatever ``encode`` returns when neither candidate is usable.
        """
        for candidate in (embedding, engram.embedding):
            if candidate is not None and len(candidate) > 0:
                return candidate
        return encode(engram.content)

    def add(self, engram: Engram, embedding: Optional[List[float]] = None) -> None:
        """Add *engram* with its embedding to the collection.

        Does nothing when no embedding is available (none supplied, none on
        the engram, and ``encode`` returned ``None``).
        """
        emb = self._resolve_embedding(engram, embedding)
        if emb is None:
            return
        self.collection.add(
            ids=[str(engram.id)],
            embeddings=[emb],
            metadatas=[self._build_metadata(engram)],
        )

    def update(self, engram: Engram, embedding: Optional[List[float]] = None) -> None:
        """Update the stored embedding and metadata of *engram*.

        Does nothing when no embedding is available (see ``add``).
        """
        emb = self._resolve_embedding(engram, embedding)
        if emb is None:
            return
        self.collection.update(
            ids=[str(engram.id)],
            embeddings=[emb],
            metadatas=[self._build_metadata(engram)],
        )

    def delete(self, eid: str) -> None:
        """Remove the entry with ID *eid* from the collection."""
        # IDs are stored as strings by add/update, so coerce here as well.
        self.collection.delete(ids=[str(eid)])

    def query(self, text: str, top_k: int = 5, filters: Optional[Dict] = None) -> List[Dict[str, Any]]:
        """Run a semantic search for *text*.

        Args:
            text: Query text; it is embedded with ``encode``.
            top_k: Maximum number of hits.  Values <= 0 yield ``[]``.
            filters: Optional Chroma ``where`` filter.  An empty dict is
                treated as "no filter".

        Returns:
            A list of dicts with the keys ``id``, ``distance`` and
            ``metadata``, in the order returned by the collection.  Empty
            when the text could not be embedded.
        """
        if top_k <= 0:
            return []
        emb = encode(text)
        if emb is None:
            return []
        results = self.collection.query(
            query_embeddings=[emb],
            n_results=top_k,
            # An empty ``where`` dict is rejected by Chroma; pass None instead.
            where=filters or None,
            include=["metadatas", "distances"],
        )
        # One query embedding was sent, so each field holds one inner list.
        return [
            {"id": hit_id, "distance": distance, "metadata": metadata}
            for hit_id, distance, metadata in zip(
                results["ids"][0],
                results["distances"][0],
                results["metadatas"][0],
            )
        ]

    def get_by_id(self, eid: str) -> Optional[Dict[str, Any]]:
        """Fetch a single entry by ID.

        Returns:
            A dict with the keys ``id``, ``embedding`` and ``metadata``, or
            ``None`` when the ID is unknown or the lookup failed.  Lookup
            failures are logged, not raised (best-effort read).
        """
        try:
            record = self.collection.get(
                ids=[str(eid)],
                include=["embeddings", "metadatas"],
            )
        except Exception as exc:  # best-effort boundary: log and report a miss
            self._logger.warning("[chroma_store] get_by_id failed: %s", exc)
            return None

        if not record or not record.get("ids"):
            return None

        embeddings = record.get("embeddings")
        metadatas = record.get("metadatas")
        # Embeddings may be array-like, so test with len() instead of truthiness.
        has_embedding = embeddings is not None and len(embeddings) > 0
        return {
            "id": record["ids"][0],
            "embedding": embeddings[0] if has_embedding else None,
            "metadata": metadatas[0] if metadatas else {},
        }

    def count(self) -> int:
        """Return the number of entries in the collection."""
        return self.collection.count()