# File: second-brain/src/chroma_store.py (Python, 122 lines, 4.2 KiB)

"""
chroma_store.py - ChromaDB vector store for semantic search.
Extends the SQLite store with vector similarity.
"""
import json
from pathlib import Path
from typing import List, Optional, Dict, Any
from uuid import UUID
import chromadb
from chromadb.config import Settings
from .engram import Engram
from .embedder import encode
class ChromaStore:
    """ChromaDB-backed vector store for engrams.

    Every engram is stored as one embedding vector plus a flat dictionary of
    scalar metadata inside a persistent collection named ``engrams`` that is
    configured for cosine distance.
    """

    # Maximum number of characters of ``engram.content`` copied into the
    # metadata. add() and update() share this limit so that an update never
    # shortens the content preview written by the initial add.
    CONTENT_PREVIEW_CHARS = 1000

    def __init__(self, path: str = "data/chroma"):
        """Open (or create) the persistent collection below *path*.

        Args:
            path: Directory used by the persistent Chroma client. It is
                created, including parents, when it does not exist yet.
        """
        self.path = Path(path)
        self.path.mkdir(parents=True, exist_ok=True)
        self.client = chromadb.PersistentClient(path=str(self.path))
        self.collection = self.client.get_or_create_collection(
            name="engrams",
            metadata={"hnsw:space": "cosine"},
        )

    def _build_metadata(self, engram: "Engram") -> Dict[str, Any]:
        """Return the scalar-only metadata dictionary stored next to the vector.

        Only an explicit allow-list of fields is copied from
        ``engram.metadata``; each value is coerced to ``str``, ``float`` or
        ``int``. Tags given as a list or tuple are joined with commas, and a
        ``None`` tag value becomes the empty string.

        Args:
            engram: Object exposing ``metadata`` (mapping), ``content`` and
                ``correctness.confirmed``.

        Returns:
            A new dictionary with the keys ``source``, ``confidence``,
            ``grounding``, ``tags``, ``created``, ``modified``,
            ``access_count``, ``correctness`` and ``content``.
        """
        meta = engram.metadata

        tags = meta.get("tags", [])
        if tags is None:
            # Avoid persisting the literal string "None" as a tag.
            tags_text = ""
        elif isinstance(tags, (list, tuple)):
            tags_text = ",".join(str(tag) for tag in tags)
        else:
            tags_text = str(tags)

        return {
            "source": str(meta.get("source", "agent")),
            "confidence": float(meta.get("confidence", 0.5)),
            "grounding": int(meta.get("grounding", 1)),
            "tags": tags_text,
            "created": str(meta.get("created", "")),
            "modified": str(meta.get("modified", "")),
            "access_count": int(meta.get("access_count", 0)),
            "correctness": (
                "confirmed" if engram.correctness.confirmed else "unconfirmed"
            ),
            # Only a bounded preview of the content is kept in the metadata.
            "content": str(engram.content)[: self.CONTENT_PREVIEW_CHARS],
        }

    @staticmethod
    def _resolve_embedding(
        engram: "Engram", embedding: Optional[List[float]]
    ) -> Optional[List[float]]:
        """Pick the vector to store for *engram*.

        Preference order: the explicit *embedding* argument, then
        ``engram.embedding``, then ``encode(engram.content)``. A candidate is
        skipped when it is ``None`` or empty. The check uses ``is None`` and
        ``len()`` rather than truthiness because array-like embeddings may not
        define an unambiguous boolean value.

        Returns:
            The chosen vector, or whatever ``encode`` returns (possibly
            ``None``) when no usable vector was supplied.
        """
        if embedding is not None and len(embedding) > 0:
            return embedding
        stored = engram.embedding
        if stored is not None and len(stored) > 0:
            return stored
        return encode(engram.content)

    def add(self, engram: "Engram", embedding: Optional[List[float]] = None) -> None:
        """Add *engram* with its embedding to the vector collection.

        Does nothing when no embedding is available, i.e. none was supplied
        and ``encode`` returned ``None``.

        Args:
            engram: The engram to store; ``str(engram.id)`` is used as the id.
            embedding: Optional precomputed vector overriding
                ``engram.embedding``.
        """
        eid = str(engram.id)
        vector = self._resolve_embedding(engram, embedding)
        if vector is None:
            return
        self.collection.add(
            ids=[eid],
            embeddings=[vector],
            metadatas=[self._build_metadata(engram)],
        )

    def update(self, engram: "Engram", embedding: Optional[List[float]] = None) -> None:
        """Update the stored vector and metadata of *engram*.

        Does nothing when no embedding is available, i.e. none was supplied
        and ``encode`` returned ``None``.

        Args:
            engram: The engram to update; ``str(engram.id)`` is used as the id.
            embedding: Optional precomputed vector overriding
                ``engram.embedding``.
        """
        eid = str(engram.id)
        vector = self._resolve_embedding(engram, embedding)
        if vector is None:
            return
        self.collection.update(
            ids=[eid],
            embeddings=[vector],
            metadatas=[self._build_metadata(engram)],
        )

    def delete(self, eid: str) -> None:
        """Remove the entry with id *eid* from the vector collection.

        *eid* is passed through ``str()`` so UUID objects are accepted too.
        """
        self.collection.delete(ids=[str(eid)])

    def query(self, text: str, top_k: int = 5, filters: Optional[Dict] = None) -> List[Dict[str, Any]]:
        """Run a semantic search for *text*.

        Args:
            text: Query text; it is embedded with ``encode``.
            top_k: Maximum number of hits. Values ``<= 0`` yield an empty
                list without querying the collection.
            filters: Optional metadata filter forwarded as ``where``. An
                empty dictionary is treated like ``None`` (no filter).

        Returns:
            A list of dictionaries with the keys ``id``, ``distance`` and
            ``metadata``, in the order returned by the collection. The list
            is empty when *text* could not be embedded.
        """
        if top_k <= 0:
            return []
        query_vector = encode(text)
        if query_vector is None:
            return []
        results = self.collection.query(
            query_embeddings=[query_vector],
            n_results=top_k,
            # An empty filter dict is normalised to "no filter".
            where=filters or None,
            # Documents are neither stored by this class nor returned here.
            include=["metadatas", "distances"],
        )
        # Results are nested per query; there is exactly one query, so take
        # the first entry of each field.
        hit_ids = results["ids"][0]
        hit_distances = results["distances"][0]
        hit_metadatas = results["metadatas"][0]
        return [
            {"id": hit_id, "distance": distance, "metadata": metadata}
            for hit_id, distance, metadata in zip(
                hit_ids, hit_distances, hit_metadatas
            )
        ]

    def get_by_id(self, eid: str) -> Optional[Dict[str, Any]]:
        """Fetch a single entry by id.

        Lookup is best-effort: any exception is printed with the
        ``[chroma_store]`` prefix and ``None`` is returned.

        Args:
            eid: Id of the entry; passed through ``str()``.

        Returns:
            A dictionary with the keys ``id``, ``embedding`` and
            ``metadata``, or ``None`` when nothing was found or the lookup
            failed.
        """
        try:
            record = self.collection.get(
                ids=[str(eid)], include=["embeddings", "metadatas"]
            )
            if record and record["ids"]:
                # A field may be present but None; test the value itself, and
                # use "is not None" because embeddings may be array-like.
                embeddings = record.get("embeddings")
                metadatas = record.get("metadatas")
                return {
                    "id": record["ids"][0],
                    "embedding": embeddings[0] if embeddings is not None else None,
                    "metadata": metadatas[0] if metadatas is not None else {},
                }
        except Exception as e:
            print(f"[chroma_store] get_by_id failed: {e}")
        return None

    def count(self) -> int:
        """Return the number of entries in the collection."""
        return self.collection.count()