Second-brain 2.0: hybrid retrieval, obsidian bridge, vector watermark, tests
This commit is contained in:
@@ -38,6 +38,12 @@ try:
|
||||
except ImportError:
|
||||
Retriever = None
|
||||
|
||||
# Chroma: optional (braucht chromadb)
|
||||
try:
|
||||
from src.chroma_store import ChromaStore
|
||||
except Exception:
|
||||
ChromaStore = None
|
||||
|
||||
|
||||
# --- Konfiguration ---
|
||||
BRAIN_DB = Path(__file__).parent.parent / "data" / "brain.sqlite"
|
||||
@@ -207,10 +213,34 @@ def enrich_context(topic: str, limit: int = 3) -> str:
|
||||
"""
|
||||
store = get_brain()
|
||||
|
||||
# Versuche Retriever (mit Embeddings), fallback auf einfache Textsuche
|
||||
# Versuche Hybrid-Retrieval (FTS + optional Vector), fallback auf Textsuche
|
||||
if Retriever:
|
||||
ret = Retriever(store)
|
||||
results = ret.retrieve(topic, limit=limit, min_confidence=0.3)
|
||||
chroma = None
|
||||
if ChromaStore:
|
||||
try:
|
||||
chroma = ChromaStore(path=str(Path(__file__).parent.parent / "data" / "chroma"))
|
||||
except Exception:
|
||||
chroma = None
|
||||
ret = Retriever(store, chroma=chroma)
|
||||
try:
|
||||
results = ret.hybrid_retrieve(topic, limit=limit * 3, min_confidence=0.3)
|
||||
except Exception:
|
||||
results = ret.retrieve(topic, limit=limit * 3, min_confidence=0.3)
|
||||
|
||||
# confirmed-first ranking
|
||||
def _rank(r):
|
||||
eg = r["engram"]
|
||||
confirmed = 1 if getattr(eg.correctness, "confirmed", False) else 0
|
||||
return (confirmed, float(r.get("score", 0.0)))
|
||||
|
||||
results.sort(key=_rank, reverse=True)
|
||||
|
||||
# If we have confirmed results, show only confirmed up to limit
|
||||
confirmed_only = [r for r in results if r["engram"].correctness.confirmed]
|
||||
if confirmed_only:
|
||||
results = confirmed_only[:limit]
|
||||
else:
|
||||
results = results[:limit]
|
||||
else:
|
||||
results_raw = store.search_text(topic, limit=limit)
|
||||
results = [{"engram": eg, "score": 0.5} for eg in results_raw]
|
||||
|
||||
@@ -7,12 +7,10 @@ Phase 2: + Embedding + Fusion.
|
||||
from typing import List, Dict, Any, Optional
|
||||
from .engram import Engram
|
||||
from .store import EngramStore
|
||||
from .chroma_store import ChromaStore
|
||||
from .embedder import encode
|
||||
|
||||
|
||||
class Retriever:
|
||||
def __init__(self, store: EngramStore, chroma: Optional[ChromaStore] = None):
|
||||
def __init__(self, store: EngramStore, chroma: Optional[object] = None):
|
||||
self.store = store
|
||||
self.chroma = chroma
|
||||
|
||||
@@ -50,7 +48,6 @@ class Retriever:
|
||||
if not self.chroma:
|
||||
return []
|
||||
chroma_results = self.chroma.query(query, top_k=limit * 3)
|
||||
eids = [r["id"] for r in chroma_results]
|
||||
results = []
|
||||
for r in chroma_results:
|
||||
eg = self.store.get(r["id"])
|
||||
|
||||
15
src/store.py
15
src/store.py
@@ -127,6 +127,14 @@ class EngramStore:
|
||||
).fetchall()
|
||||
return [self._row_to_engram(r) for r in rows]
|
||||
|
||||
def get_modified_since(self, iso_ts: str, limit: int = 5000) -> List[Engram]:
    """Return the engrams whose `modified_at` lies after `iso_ts`.

    Args:
        iso_ts: Exclusive lower bound; the database compares it against
            the `modified_at` column with `>`.
        limit: Maximum number of rows to fetch.

    Returns:
        The matching engrams, ordered by ascending `modified_at`, each
        converted from its row via `_row_to_engram`.
    """
    query = "SELECT * FROM engrams WHERE modified_at > ? ORDER BY modified_at ASC LIMIT ?"
    cursor = self._conn.execute(query, (iso_ts, limit))
    engrams = []
    for record in cursor.fetchall():
        engrams.append(self._row_to_engram(record))
    return engrams
|
||||
|
||||
def delete(self, engram_id: str) -> bool:
|
||||
"""Löscht ein Engramm und alle Verknüpfungen."""
|
||||
rowid = self._conn.execute(
|
||||
@@ -239,6 +247,13 @@ class EngramStore:
|
||||
"links": json.loads(row["links_json"]),
|
||||
"hierarchy": json.loads(row["hierarchy_json"]),
|
||||
}
|
||||
# Keep Engram metadata timestamps aligned with DB columns so downstream
|
||||
# consumers (e.g. vector indexing watermarks) can rely on them.
|
||||
try:
|
||||
d["metadata"]["created"] = row["created_at"]
|
||||
d["metadata"]["modified"] = row["modified_at"]
|
||||
except Exception:
|
||||
pass
|
||||
emb = row["embedding_json"]
|
||||
if emb:
|
||||
d["embedding"] = json.loads(emb)
|
||||
|
||||
Reference in New Issue
Block a user