131 lines
4.2 KiB
Python
131 lines
4.2 KiB
Python
"""
Hybrid retrieval engine.

Phase 1: FTS keyword search + confidence reranking.
Phase 2: adds embedding search + score fusion.
"""
|
|
|
|
from typing import List, Dict, Any, Optional
|
|
from .engram import Engram
|
|
from .store import EngramStore
|
|
|
|
|
|
class Retriever:
    """Keyword, semantic and hybrid lookup of engrams.

    The retriever reads from two collaborators:

    * ``store`` -- must provide ``search_text(query, limit=...)``,
      ``get(id)``, ``save(engram)``, ``get_all(limit=...)`` and ``stats()``.
    * ``chroma`` -- optional; when given it must provide
      ``query(text, top_k=...)`` returning an iterable of mappings with an
      ``"id"`` key and, optionally, a ``"distance"`` key.

    All ``*retrieve`` methods return a list of dicts of the form
    ``{"engram": <engram>, "score": <float>, "match_type": <str>}``,
    ordered by descending score and truncated to ``limit`` entries.
    """

    def __init__(self, store: "EngramStore", chroma: Optional[object] = None):
        """Remember the engram store and the optional semantic index."""
        self.store = store
        self.chroma = chroma

    def retrieve(
        self,
        query: str,
        limit: int = 5,
        min_confidence: float = 0.0,
        source_filter: Optional[str] = None,
        tag_filter: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """Full-text search reranked by engram confidence.

        Args:
            query: Text handed to ``store.search_text``.
            limit: Maximum number of hits to return. Three times as many
                candidates are requested from the store so that filtering
                still leaves enough hits. A non-positive value yields ``[]``.
            min_confidence: Candidates whose ``compute_confidence()`` is
                below this value are dropped.
            source_filter: If truthy, keep only engrams whose
                ``metadata["source"]`` equals this value.
            tag_filter: If truthy, keep only engrams whose
                ``metadata["tags"]`` contains this value.

        Returns:
            Hits with ``match_type == "keyword"``; ``score`` is the confidence
            computed before the engram is touched.

        Side effects:
            Every *returned* engram gets ``touch()`` called and is written
            back with ``store.save``. Candidates that are filtered out or
            cut off by ``limit`` are left unmodified.
        """
        if limit <= 0:
            # A negative slice bound would silently drop trailing hits.
            return []

        results: List[Dict[str, Any]] = []
        for eg in self.store.search_text(query, limit=limit * 3):
            conf = eg.compute_confidence()
            if conf < min_confidence:
                continue
            if source_filter and eg.metadata.get("source") != source_filter:
                continue
            # ``or ()`` also covers a "tags" key that is present but None.
            if tag_filter and tag_filter not in (eg.metadata.get("tags") or ()):
                continue
            results.append({"engram": eg, "score": conf, "match_type": "keyword"})

        results.sort(key=lambda r: r["score"], reverse=True)
        top = results[:limit]

        # Only engrams that are actually handed back to the caller are
        # touched and persisted.
        for hit in top:
            hit["engram"].touch()
            self.store.save(hit["engram"])
        return top

    def semantic_retrieve(
        self,
        query: str,
        limit: int = 5,
        min_confidence: float = 0.0,
    ) -> List[Dict[str, Any]]:
        """Semantic search through the optional ``chroma`` index.

        Args:
            query: Text handed to ``chroma.query``.
            limit: Maximum number of hits to return; ``limit * 3`` rows are
                requested from the index. A non-positive value yields ``[]``.
            min_confidence: Engrams whose ``compute_confidence()`` is below
                this value are dropped.

        Returns:
            Hits with ``match_type == "semantic"`` and
            ``score == 1.0 - distance`` (a missing distance counts as 0).
            Rows whose id is unknown to the store are skipped. Returns ``[]``
            when no ``chroma`` object was configured. Unlike ``retrieve``,
            this method does not touch or save engrams.
        """
        if not self.chroma or limit <= 0:
            return []

        results: List[Dict[str, Any]] = []
        for row in self.chroma.query(query, top_k=limit * 3):
            eg = self.store.get(row["id"])
            if not eg:
                continue
            if eg.compute_confidence() < min_confidence:
                continue
            # NOTE(review): the score goes negative when distance > 1;
            # whether the index normalises distances is not visible here.
            score = 1.0 - (row.get("distance") or 0.0)
            results.append({"engram": eg, "score": score, "match_type": "semantic"})

        results.sort(key=lambda r: r["score"], reverse=True)
        return results[:limit]

    def hybrid_retrieve(
        self,
        query: str,
        limit: int = 5,
        min_confidence: float = 0.0,
        keyword_weight: float = 0.4,
        semantic_weight: float = 0.6,
    ) -> List[Dict[str, Any]]:
        """Fuse keyword and semantic hits into a single ranking.

        For every engram found by either search the score is::

            min(1.0, keyword_weight * kw + semantic_weight * sem
                     + 0.1 * engram.compute_confidence())

        where ``kw`` / ``sem`` are the scores from ``retrieve`` /
        ``semantic_retrieve`` (0.0 when that search did not find the engram).

        Args:
            query: Search text passed to both searches.
            limit: Maximum number of fused hits; each search is asked for
                ``limit * 3`` hits. A non-positive value yields ``[]``.
            min_confidence: Forwarded to both searches.
            keyword_weight: Weight of the keyword score.
            semantic_weight: Weight of the semantic score.

        Returns:
            Hits whose ``match_type`` is ``"hybrid"`` (found by both
            searches), ``"semantic"`` or ``"keyword"``.

        Side effects:
            The keyword candidates are fetched through ``retrieve`` and are
            therefore touched and saved by it.
        """
        if limit <= 0:
            return []

        kw_results = {
            str(r["engram"].id): r
            for r in self.retrieve(query, limit=limit * 3, min_confidence=min_confidence)
        }
        sem_results = {
            str(r["engram"].id): r
            for r in self.semantic_retrieve(
                query, limit=limit * 3, min_confidence=min_confidence
            )
        }

        # Walk the ids in a stable order (keyword hits first, then the
        # semantic-only ones) so that equal scores do not come back in a
        # different order on every run, as they would with a set.
        ordered_ids = list(kw_results)
        ordered_ids.extend(eid for eid in sem_results if eid not in kw_results)

        fusion: List[Dict[str, Any]] = []
        for eid in ordered_ids:
            kw = kw_results.get(eid)
            sem = sem_results.get(eid)

            kw_score = kw["score"] if kw else 0.0
            # A distance above 1 gives a negative semantic score; clamp it so
            # an additional semantic match never ranks an engram below the
            # same engram found by keyword alone.
            sem_score = max(0.0, sem["score"]) if sem else 0.0

            # Weighted fusion
            mixed = keyword_weight * kw_score + semantic_weight * sem_score

            # Confidence bonus
            eg = kw["engram"] if kw else sem["engram"]
            neural_bonus = eg.compute_confidence() * 0.1

            final = min(1.0, mixed + neural_bonus)

            if kw and sem:
                match_type = "hybrid"
            elif sem:
                match_type = "semantic"
            else:
                match_type = "keyword"

            fusion.append({"engram": eg, "score": final, "match_type": match_type})

        fusion.sort(key=lambda r: r["score"], reverse=True)
        return fusion[:limit]

    def related(self, engram_id: str, limit: int = 5) -> List["Engram"]:
        """Return engrams linked from ``engram_id``, highest confidence first.

        Link ids are read from the engram's ``links`` attribute and resolved
        through ``store.get``; links that cannot be resolved are skipped.
        Returns ``[]`` when the engram itself is unknown or ``limit`` is not
        positive.
        """
        if limit <= 0:
            return []
        eg = self.store.get(engram_id)
        if not eg:
            return []

        resolved = (self.store.get(str(lid)) for lid in eg.links)
        out = [linked for linked in resolved if linked]
        out.sort(key=lambda e: e.compute_confidence(), reverse=True)
        return out[:limit]

    def recent(self, limit: int = 10) -> List["Engram"]:
        """Return ``store.get_all(limit=limit)`` unchanged."""
        return self.store.get_all(limit=limit)

    def stats(self) -> Dict[str, Any]:
        """Return the store's own statistics (``store.stats()``)."""
        return self.store.stats()
|