Second-brain 2.0: hybrid retrieval, obsidian bridge, vector watermark, tests

This commit is contained in:
2026-05-26 19:27:12 +02:00
parent 29bc45d623
commit e1640071e4
7 changed files with 291 additions and 16 deletions

View File

@@ -38,6 +38,12 @@ try:
except ImportError:
Retriever = None
# Chroma: optional (braucht chromadb)
try:
from src.chroma_store import ChromaStore
except Exception:
ChromaStore = None
# --- Konfiguration ---
BRAIN_DB = Path(__file__).parent.parent / "data" / "brain.sqlite"
@@ -207,10 +213,34 @@ def enrich_context(topic: str, limit: int = 3) -> str:
"""
store = get_brain()
# Versuche Retriever (mit Embeddings), fallback auf einfache Textsuche
# Versuche Hybrid-Retrieval (FTS + optional Vector), fallback auf Textsuche
if Retriever:
ret = Retriever(store)
results = ret.retrieve(topic, limit=limit, min_confidence=0.3)
chroma = None
if ChromaStore:
try:
chroma = ChromaStore(path=str(Path(__file__).parent.parent / "data" / "chroma"))
except Exception:
chroma = None
ret = Retriever(store, chroma=chroma)
try:
results = ret.hybrid_retrieve(topic, limit=limit * 3, min_confidence=0.3)
except Exception:
results = ret.retrieve(topic, limit=limit * 3, min_confidence=0.3)
# confirmed-first ranking
def _rank(r):
eg = r["engram"]
confirmed = 1 if getattr(eg.correctness, "confirmed", False) else 0
return (confirmed, float(r.get("score", 0.0)))
results.sort(key=_rank, reverse=True)
# If we have confirmed results, show only confirmed up to limit
confirmed_only = [r for r in results if r["engram"].correctness.confirmed]
if confirmed_only:
results = confirmed_only[:limit]
else:
results = results[:limit]
else:
results_raw = store.search_text(topic, limit=limit)
results = [{"engram": eg, "score": 0.5} for eg in results_raw]

View File

@@ -7,12 +7,10 @@ Phase 2: + Embedding + Fusion.
from typing import List, Dict, Any, Optional
from .engram import Engram
from .store import EngramStore
from .chroma_store import ChromaStore
from .embedder import encode
class Retriever:
def __init__(self, store: EngramStore, chroma: Optional[ChromaStore] = None):
def __init__(self, store: EngramStore, chroma: Optional[object] = None):
self.store = store
self.chroma = chroma
@@ -50,7 +48,6 @@ class Retriever:
if not self.chroma:
return []
chroma_results = self.chroma.query(query, top_k=limit * 3)
eids = [r["id"] for r in chroma_results]
results = []
for r in chroma_results:
eg = self.store.get(r["id"])

View File

@@ -127,6 +127,14 @@ class EngramStore:
).fetchall()
return [self._row_to_engram(r) for r in rows]
def get_modified_since(self, iso_ts: str, limit: int = 5000) -> List[Engram]:
"""Gibt Engramme zurück, deren `modified_at` nach `iso_ts` liegt."""
rows = self._conn.execute(
"SELECT * FROM engrams WHERE modified_at > ? ORDER BY modified_at ASC LIMIT ?",
(iso_ts, limit),
).fetchall()
return [self._row_to_engram(r) for r in rows]
def delete(self, engram_id: str) -> bool:
"""Löscht ein Engramm und alle Verknüpfungen."""
rowid = self._conn.execute(
@@ -239,6 +247,13 @@ class EngramStore:
"links": json.loads(row["links_json"]),
"hierarchy": json.loads(row["hierarchy_json"]),
}
# Keep Engram metadata timestamps aligned with DB columns so downstream
# consumers (e.g. vector indexing watermarks) can rely on them.
try:
d["metadata"]["created"] = row["created_at"]
d["metadata"]["modified"] = row["modified_at"]
except Exception:
pass
emb = row["embedding_json"]
if emb:
d["embedding"] = json.loads(emb)