feat(complete): Phase 2-5 - Vektor-Embeddings, ChromaDB, Neural Scorer, Streamlit Dashboard, Graph-Visualisierung

This commit is contained in:
2026-05-25 09:43:04 +02:00
parent 08d21f8087
commit 59f4059cd8
6 changed files with 842 additions and 2 deletions

119
src/chroma_store.py Normal file
View File

@@ -0,0 +1,119 @@
"""
chroma_store.py - ChromaDB vector store for semantic search.
Extends the SQLite store with vector similarity.
"""
import json
from pathlib import Path
from typing import List, Optional, Dict, Any
from uuid import UUID
import chromadb
from chromadb.config import Settings
from .engram import Engram
from .embedder import encode
class ChromaStore:
    """ChromaDB-backed vector store for semantic search over engrams.

    Every engram is kept under ``str(engram.id)`` in the ``"engrams"``
    collection, together with its embedding and a flattened metadata dict.
    The collection is created with cosine distance (``hnsw:space = cosine``).
    """

    # Number of leading content characters mirrored into the metadata.
    _CONTENT_PREVIEW_CHARS = 1000

    def __init__(self, path: str = "data/chroma"):
        """Open (or create) the persistent collection stored under *path*.

        Args:
            path: Directory of the on-disk Chroma database; created with all
                missing parents if it does not exist yet.
        """
        self.path = Path(path)
        self.path.mkdir(parents=True, exist_ok=True)
        self.client = chromadb.PersistentClient(path=str(self.path))
        self.collection = self.client.get_or_create_collection(
            name="engrams",
            metadata={"hnsw:space": "cosine"},
        )

    @staticmethod
    def _has_values(vector: Any) -> bool:
        """Return True if *vector* is a non-empty sequence.

        Uses ``len`` instead of truthiness so that array-like embeddings
        (whose truth value is ambiguous) are handled as well as lists.
        """
        return vector is not None and len(vector) > 0

    def _resolve_embedding(
        self, engram: "Engram", embedding: Optional[List[float]]
    ) -> Optional[List[float]]:
        """Pick the embedding to store for *engram*.

        Order of preference: the explicit *embedding* argument, then
        ``engram.embedding``, then a fresh ``encode(engram.content)``.
        Returns whatever ``encode`` returns as a last resort, which may be
        ``None``.
        """
        if self._has_values(embedding):
            return embedding
        stored = engram.embedding
        if self._has_values(stored):
            return stored
        return encode(engram.content)

    def _build_metadata(self, engram: "Engram") -> Dict[str, Any]:
        """Return a flattened copy of ``engram.metadata`` for ChromaDB.

        * ``tags`` becomes a comma-separated string. Lists and tuples are
          joined in order, sets are joined sorted, any other non-``None``
          value is stringified. A missing ``tags`` key yields ``""``.
        * ``source``, ``confidence`` and ``correctness`` get defaults.
        * ``hierarchy`` (if present) is serialized as a JSON string.

        The engram's own metadata dict is never modified.
        """
        # dict(...) copies, and `or {}` tolerates engrams without metadata.
        meta = dict(engram.metadata or {})

        tags = meta.pop("tags", [])
        if isinstance(tags, (list, tuple)):
            meta["tags"] = ",".join(str(t) for t in tags)
        elif isinstance(tags, (set, frozenset)):
            # Sets have no stable order; sort for a deterministic value.
            meta["tags"] = ",".join(sorted(str(t) for t in tags))
        elif tags is not None:
            # E.g. an already-joined string; keep it instead of dropping it.
            meta["tags"] = str(tags)

        meta.setdefault("source", "agent")
        meta.setdefault("confidence", 0.5)
        meta.setdefault("correctness", "unconfirmed")

        # Nested structure is stored as a JSON string.
        if "hierarchy" in meta:
            meta["hierarchy"] = json.dumps(meta["hierarchy"])
        return meta

    def _build_record_metadata(self, engram: "Engram") -> Dict[str, Any]:
        """Return the metadata for *engram* plus a truncated content preview."""
        meta = self._build_metadata(engram)
        meta["content"] = engram.content[: self._CONTENT_PREVIEW_CHARS]
        return meta

    def add(self, engram: "Engram", embedding: Optional[List[float]] = None) -> None:
        """Add *engram* and its embedding to the vector DB.

        Args:
            engram: Engram to store; its ``id`` is stringified as the key.
            embedding: Optional precomputed vector. When missing or empty,
                ``engram.embedding`` is used, else the content is encoded.

        Does nothing when no embedding can be obtained.
        """
        emb = self._resolve_embedding(engram, embedding)
        if emb is None:
            return
        self.collection.add(
            ids=[str(engram.id)],
            embeddings=[emb],
            metadatas=[self._build_record_metadata(engram)],
        )

    def update(self, engram: "Engram", embedding: Optional[List[float]] = None) -> None:
        """Update the stored embedding and metadata of *engram*.

        Takes the same arguments as :meth:`add`. The content preview in the
        metadata is refreshed too, so it cannot go stale after an edit.
        Does nothing when no embedding can be obtained.
        """
        emb = self._resolve_embedding(engram, embedding)
        if emb is None:
            return
        self.collection.update(
            ids=[str(engram.id)],
            embeddings=[emb],
            metadatas=[self._build_record_metadata(engram)],
        )

    def delete(self, eid: str) -> None:
        """Remove the engram with id *eid* from the vector DB."""
        # Records are keyed by str(engram.id); accept UUID-like ids as well.
        self.collection.delete(ids=[str(eid)])

    def query(self, text: str, top_k: int = 5, filters: Optional[Dict] = None) -> List[Dict[str, Any]]:
        """Run a semantic search for *text*.

        Args:
            text: Query text; it is embedded with ``encode``.
            top_k: Maximum number of hits. Values <= 0 yield ``[]``.
            filters: Optional Chroma ``where`` filter. An empty dict is
                treated like ``None`` (no filtering).

        Returns:
            A list of ``{"id", "distance", "metadata"}`` dicts in the order
            returned by the collection, or ``[]`` if the text could not be
            embedded.
        """
        if top_k <= 0:
            return []
        emb = encode(text)
        if emb is None:
            return []
        results = self.collection.query(
            query_embeddings=[emb],
            n_results=top_k,
            # An empty where-clause is rejected by Chroma's validation.
            where=filters or None,
            include=["metadatas", "distances"],
        )
        # One query embedding was sent, so only the first result row matters.
        return [
            {"id": rid, "distance": dist, "metadata": meta}
            for rid, dist, meta in zip(
                results["ids"][0],
                results["distances"][0],
                results["metadatas"][0],
            )
        ]

    def get_by_id(self, eid: str) -> Optional[Dict[str, Any]]:
        """Fetch a single engram record by id.

        Returns:
            ``{"id", "embedding", "metadata"}`` for the record, or ``None``
            when it does not exist or the lookup fails. ``embedding`` is
            ``None`` and ``metadata`` is ``{}`` when the collection returns
            no value for them.
        """
        try:
            record = self.collection.get(
                ids=[str(eid)], include=["embeddings", "metadatas"]
            )
        except Exception as e:
            # Best-effort lookup: report the failure and signal "not found".
            print(f"[chroma_store] get_by_id failed: {e}")
            return None

        if not record or not record.get("ids"):
            return None

        # Check the values, not just key presence: a key can map to None.
        embeddings = record.get("embeddings")
        metadatas = record.get("metadatas")
        return {
            "id": record["ids"][0],
            "embedding": embeddings[0] if self._has_values(embeddings) else None,
            "metadata": (metadatas[0] or {}) if self._has_values(metadatas) else {},
        }

    def count(self) -> int:
        """Return the number of records in the collection."""
        return self.collection.count()