# Files
# second-brain/src/retriever.py
#
# 131 lines
# 4.2 KiB
# Python

"""
Hybrid-Retrieval Engine.
Phase 1: FTS-Keyword + Confidence-Reranking.
Phase 2: + Embedding + Fusion.
"""
from typing import List, Dict, Any, Optional
from .engram import Engram
from .store import EngramStore
class Retriever:
    """Retrieval front-end over an engram store.

    Provides keyword retrieval ranked by engram confidence, optional
    semantic retrieval through a ``chroma`` backend, and a weighted fusion
    of the two result lists.
    """

    # Candidates requested per result slot, so that confidence/source/tag
    # filtering still leaves enough entries to fill ``limit``.
    CANDIDATE_FACTOR = 3
    # Fraction of an engram's confidence added on top of the fused score.
    CONFIDENCE_BONUS_WEIGHT = 0.1

    def __init__(self, store: "EngramStore", chroma: Optional[Any] = None):
        """Bind the retriever to its backends.

        Args:
            store: Engram store used for text search, lookup and saving.
            chroma: Optional semantic backend; it is called as
                ``chroma.query(query, top_k=...)``. When falsy, semantic
                retrieval yields no results.
        """
        self.store = store
        self.chroma = chroma

    def retrieve(
        self,
        query: str,
        limit: int = 5,
        min_confidence: float = 0.0,
        source_filter: Optional[str] = None,
        tag_filter: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """Keyword search, re-ranked by engram confidence.

        Args:
            query: Text handed to ``store.search_text``.
            limit: Maximum number of results; values <= 0 yield ``[]``.
            min_confidence: Engrams whose confidence is below this are dropped.
            source_filter: If set, keep only engrams whose
                ``metadata["source"]`` equals this value.
            tag_filter: If set, keep only engrams whose ``metadata["tags"]``
                contains this value.

        Returns:
            Up to ``limit`` dicts with keys ``engram``, ``score`` (the
            confidence computed before the access was recorded) and
            ``match_type`` (always ``"keyword"``), best score first.
        """
        if limit <= 0:
            return []

        candidates = self.store.search_text(
            query, limit=limit * self.CANDIDATE_FACTOR
        )
        results: List[Dict[str, Any]] = []
        for eg in candidates:
            conf = eg.compute_confidence()
            if conf < min_confidence:
                continue
            if source_filter and eg.metadata.get("source") != source_filter:
                continue
            # ``or ()`` tolerates a ``tags`` entry that is present but None.
            if tag_filter and tag_filter not in (eg.metadata.get("tags") or ()):
                continue
            results.append({"engram": eg, "score": conf, "match_type": "keyword"})

        results.sort(key=lambda r: r["score"], reverse=True)
        top = results[:limit]

        # Record the access only for engrams that are actually returned;
        # candidates cut off by ``limit`` must not be touched or re-saved.
        for hit in top:
            hit["engram"].touch()
            self.store.save(hit["engram"])
        return top

    def semantic_retrieve(
        self,
        query: str,
        limit: int = 5,
        min_confidence: float = 0.0,
    ) -> List[Dict[str, Any]]:
        """Semantic search via the ``chroma`` backend.

        Args:
            query: Text handed to ``chroma.query``.
            limit: Maximum number of results; values <= 0 yield ``[]``.
            min_confidence: Engrams whose confidence is below this are dropped.

        Returns:
            Up to ``limit`` dicts with keys ``engram``, ``score``
            (``1.0 - distance``) and ``match_type`` (``"semantic"``), best
            score first. Empty when no backend is configured. Hits whose id
            is unknown to the store are skipped.
        """
        if not self.chroma or limit <= 0:
            return []

        hits = self.chroma.query(query, top_k=limit * self.CANDIDATE_FACTOR)
        results: List[Dict[str, Any]] = []
        for hit in hits:
            eg = self.store.get(hit["id"])
            if not eg:
                continue
            if eg.compute_confidence() < min_confidence:
                continue
            # A missing or None distance is treated as 0.
            distance = hit.get("distance") or 0.0
            results.append(
                {"engram": eg, "score": 1.0 - distance, "match_type": "semantic"}
            )

        results.sort(key=lambda r: r["score"], reverse=True)
        return results[:limit]

    def hybrid_retrieve(
        self,
        query: str,
        limit: int = 5,
        min_confidence: float = 0.0,
        keyword_weight: float = 0.4,
        semantic_weight: float = 0.6,
    ) -> List[Dict[str, Any]]:
        """Fuse keyword and semantic results into one ranked list.

        Each engram's score is
        ``keyword_weight * keyword_score + semantic_weight * semantic_score``
        plus a confidence bonus, capped at 1.0. A side that did not return
        the engram contributes 0.

        Args:
            query: Text passed to both retrieval paths.
            limit: Maximum number of results; values <= 0 yield ``[]``.
            min_confidence: Forwarded to both retrieval paths.
            keyword_weight: Weight of the keyword score.
            semantic_weight: Weight of the semantic score.

        Returns:
            Up to ``limit`` dicts with keys ``engram``, ``score`` and
            ``match_type`` (``"hybrid"`` when both paths matched, otherwise
            ``"semantic"`` or ``"keyword"``), best score first.
        """
        if limit <= 0:
            return []

        pool = limit * self.CANDIDATE_FACTOR
        kw_results = {
            str(r["engram"].id): r
            for r in self.retrieve(query, limit=pool, min_confidence=min_confidence)
        }
        sem_results = {
            str(r["engram"].id): r
            for r in self.semantic_retrieve(
                query, limit=pool, min_confidence=min_confidence
            )
        }

        # Insertion-ordered union (keyword hits first). Iterating a plain set
        # of strings would make the order of equal-score results depend on
        # the interpreter's hash seed.
        ordered_ids = dict.fromkeys([*kw_results, *sem_results])

        fusion: List[Dict[str, Any]] = []
        for eid in ordered_ids:
            kw = kw_results.get(eid)
            sem = sem_results.get(eid)
            kw_score = kw["score"] if kw else 0.0
            sem_score = sem["score"] if sem else 0.0
            mixed = keyword_weight * kw_score + semantic_weight * sem_score

            eg = kw["engram"] if kw else sem["engram"]
            bonus = eg.compute_confidence() * self.CONFIDENCE_BONUS_WEIGHT
            final = min(1.0, mixed + bonus)

            if kw and sem:
                match_type = "hybrid"
            elif sem:
                match_type = "semantic"
            else:
                match_type = "keyword"
            fusion.append({"engram": eg, "score": final, "match_type": match_type})

        # list.sort is stable, so ties keep the order established above.
        fusion.sort(key=lambda r: r["score"], reverse=True)
        return fusion[:limit]

    def related(self, engram_id: str, limit: int = 5) -> List["Engram"]:
        """Return engrams linked from ``engram_id``, highest confidence first.

        Links that cannot be resolved in the store are skipped. Returns
        ``[]`` when the engram itself is unknown or ``limit`` is <= 0.
        """
        if limit <= 0:
            return []
        eg = self.store.get(engram_id)
        if not eg:
            return []
        resolved = (self.store.get(str(lid)) for lid in eg.links)
        found = [linked for linked in resolved if linked]
        found.sort(key=lambda e: e.compute_confidence(), reverse=True)
        return found[:limit]

    def recent(self, limit: int = 10) -> List["Engram"]:
        """Return ``store.get_all(limit=limit)`` unchanged."""
        return self.store.get_all(limit=limit)

    def stats(self) -> Dict[str, Any]:
        """Return the store's statistics unchanged."""
        return self.store.stats()