feat(complete): Phase 2-5 - Vektor-Embeddings, ChromaDB, Neural Scorer, Streamlit Dashboard, Graph-Visualisierung
This commit is contained in:
@@ -4,14 +4,17 @@ Phase 1: FTS-Keyword + Confidence-Reranking.
|
||||
Phase 2: + Embedding + Fusion.
|
||||
"""
|
||||
|
||||
from typing import List, Dict, Any
|
||||
from typing import List, Dict, Any, Optional
|
||||
from .engram import Engram
|
||||
from .store import EngramStore
|
||||
from .chroma_store import ChromaStore
|
||||
from .embedder import encode
|
||||
|
||||
|
||||
class Retriever:
|
||||
def __init__(self, store: EngramStore):
|
||||
def __init__(self, store: EngramStore, chroma: Optional[ChromaStore] = None):
    """Bind the retriever to its storage backends.

    Args:
        store: Engram store, kept on ``self.store``.
        chroma: Optional Chroma store, kept on ``self.chroma``.
            Defaults to ``None``.
    """
    self.store, self.chroma = store, chroma
|
||||
|
||||
def retrieve(
|
||||
self,
|
||||
@@ -37,6 +40,81 @@ class Retriever:
|
||||
results.sort(key=lambda r: r["score"], reverse=True)
|
||||
return results[:limit]
|
||||
|
||||
def semantic_retrieve(
    self,
    query: str,
    limit: int = 5,
    min_confidence: float = 0.0,
) -> List[Dict[str, Any]]:
    """Run a semantic search through the attached Chroma store.

    Args:
        query: Query text forwarded to ``self.chroma.query``.
        limit: Maximum number of results to return. A value of zero or
            less yields an empty list.
        min_confidence: Engrams whose ``compute_confidence()`` is below
            this threshold are dropped.

    Returns:
        A list of dicts with the keys ``"engram"``, ``"score"`` and
        ``"match_type"`` (always ``"semantic"``), sorted by descending
        score and truncated to ``limit`` entries. The list is empty when
        no Chroma store is configured.
    """
    # Without a vector store (or with a non-positive limit) there is
    # nothing to return; a negative limit would otherwise slice items
    # off the end of the result list instead of returning nothing.
    if not self.chroma or limit <= 0:
        return []

    # Over-fetch so that hits removed by the filters below can still
    # leave up to `limit` results.
    hits = self.chroma.query(query, top_k=limit * 3)

    results: List[Dict[str, Any]] = []
    for hit in hits:
        eg = self.store.get(hit["id"])
        if not eg:
            # The id returned by the vector store is unknown to the engram store.
            continue
        if eg.compute_confidence() < min_confidence:
            continue
        # Smaller distance means a higher score; a hit without a
        # "distance" entry is treated as distance 0.
        score = 1.0 - hit.get("distance", 0)
        results.append({"engram": eg, "score": score, "match_type": "semantic"})

    results.sort(key=lambda item: item["score"], reverse=True)
    return results[:limit]
|
||||
|
||||
def hybrid_retrieve(
    self,
    query: str,
    limit: int = 5,
    min_confidence: float = 0.0,
    keyword_weight: float = 0.4,
    semantic_weight: float = 0.6,
    confidence_weight: float = 0.1,
) -> List[Dict[str, Any]]:
    """Fuse keyword and semantic results into a single ranked list.

    Each candidate's score is
    ``keyword_weight * keyword_score + semantic_weight * semantic_score``
    plus ``confidence_weight * engram.compute_confidence()``, capped at 1.0.
    A candidate missing from one of the two searches contributes 0.0 for
    that side.

    Args:
        query: Query text passed to both ``retrieve`` and
            ``semantic_retrieve``.
        limit: Maximum number of results to return. A value of zero or
            less yields an empty list.
        min_confidence: Forwarded unchanged to both underlying searches.
        keyword_weight: Weight applied to the keyword score.
        semantic_weight: Weight applied to the semantic score.
        confidence_weight: Factor applied to the engram confidence
            before it is added as a bonus.

    Returns:
        A list of dicts with the keys ``"engram"``, ``"score"`` and
        ``"match_type"`` (``"hybrid"`` when found by both searches,
        otherwise ``"semantic"`` or ``"keyword"``), sorted by descending
        score and truncated to ``limit`` entries.
    """
    if limit <= 0:
        return []

    # Over-fetch from both sources so the fused ranking has enough candidates.
    fetch = limit * 3
    kw_results = {
        str(r["engram"].id): r
        for r in self.retrieve(query, limit=fetch, min_confidence=min_confidence)
    }
    sem_results = {
        str(r["engram"].id): r
        for r in self.semantic_retrieve(query, limit=fetch, min_confidence=min_confidence)
    }

    # Insertion-ordered union (keyword hits first, then semantic-only hits).
    # A plain set of string ids iterates in hash order, which can vary
    # between interpreter runs and would make equal-score results come
    # back in an unpredictable order.
    ordered_ids = list(dict.fromkeys([*kw_results, *sem_results]))

    fusion: List[Dict[str, Any]] = []
    for eid in ordered_ids:
        kw = kw_results.get(eid)
        sem = sem_results.get(eid)
        kw_score = kw["score"] if kw else 0.0
        sem_score = sem["score"] if sem else 0.0

        # Weighted fusion of the two retrieval scores.
        mixed = keyword_weight * kw_score + semantic_weight * sem_score

        # Confidence bonus; the keyword entry's engram is preferred when
        # both searches returned the same id.
        eg = kw["engram"] if kw else sem["engram"]
        bonus = eg.compute_confidence() * confidence_weight

        final = min(1.0, mixed + bonus)

        if kw and sem:
            match_type = "hybrid"
        elif sem:
            match_type = "semantic"
        else:
            match_type = "keyword"

        fusion.append({"engram": eg, "score": final, "match_type": match_type})

    # list.sort is stable, so ties keep the deterministic order built above.
    fusion.sort(key=lambda item: item["score"], reverse=True)
    return fusion[:limit]
|
||||
|
||||
def related(self, engram_id: str, limit: int = 5) -> List[Engram]:
|
||||
eg = self.store.get(engram_id)
|
||||
if not eg:
|
||||
|
||||
Reference in New Issue
Block a user