Second-brain 2.0: hybrid retrieval, obsidian bridge, vector watermark, tests

2026-05-26 19:27:12 +02:00
parent 29bc45d623
commit e1640071e4
7 changed files with 291 additions and 16 deletions
--- a/src/openclaw_bridge.py
+++ b/src/openclaw_bridge.py
@@ -38,6 +38,12 @@ try:
 except ImportError:
    Retriever = None

+# Chroma: optional (requires chromadb)
+try:
+    from src.chroma_store import ChromaStore
+except Exception:
+    ChromaStore = None
+

 # --- Konfiguration ---
 BRAIN_DB = Path(__file__).parent.parent / "data" / "brain.sqlite"
@@ -207,10 +213,34 @@ def enrich_context(topic: str, limit: int = 3) -> str:
    """
    store = get_brain()

-    # Versuche Retriever (mit Embeddings), fallback auf einfache Textsuche
+    # Try hybrid retrieval (FTS + optional vector); fall back to plain text search
    if Retriever:
-        ret = Retriever(store)
-        results = ret.retrieve(topic, limit=limit, min_confidence=0.3)
+        chroma = None
+        if ChromaStore:
+            try:
+                chroma = ChromaStore(path=str(Path(__file__).parent.parent / "data" / "chroma"))
+            except Exception:
+                chroma = None
+        ret = Retriever(store, chroma=chroma)
+        try:
+            results = ret.hybrid_retrieve(topic, limit=limit * 3, min_confidence=0.3)
+        except Exception:
+            results = ret.retrieve(topic, limit=limit * 3, min_confidence=0.3)
+
+        # confirmed-first ranking
+        def _rank(r):
+            eg = r["engram"]
+            confirmed = 1 if getattr(eg.correctness, "confirmed", False) else 0
+            return (confirmed, float(r.get("score", 0.0)))
+
+        results.sort(key=_rank, reverse=True)
+
+        # If we have confirmed results, show only confirmed up to limit
+        confirmed_only = [r for r in results if r["engram"].correctness.confirmed]
+        if confirmed_only:
+            results = confirmed_only[:limit]
+        else:
+            results = results[:limit]
    else:
        results_raw = store.search_text(topic, limit=limit)
        results = [{"engram": eg, "score": 0.5} for eg in results_raw]
--- a/src/retriever.py
+++ b/src/retriever.py
@@ -7,12 +7,10 @@ Phase 2: + Embedding + Fusion.
 from typing import List, Dict, Any, Optional
 from .engram import Engram
 from .store import EngramStore
-from .chroma_store import ChromaStore
-from .embedder import encode


 class Retriever:
-    def __init__(self, store: EngramStore, chroma: Optional[ChromaStore] = None):
+    def __init__(self, store: EngramStore, chroma: Optional[object] = None):
        self.store = store
        self.chroma = chroma

@@ -50,7 +48,6 @@ class Retriever:
        if not self.chroma:
            return []
        chroma_results = self.chroma.query(query, top_k=limit * 3)
-        eids = [r["id"] for r in chroma_results]
        results = []
        for r in chroma_results:
            eg = self.store.get(r["id"])
--- a/src/store.py
+++ b/src/store.py
@@ -127,6 +127,14 @@ class EngramStore:
        ).fetchall()
        return [self._row_to_engram(r) for r in rows]

+    def get_modified_since(self, iso_ts: str, limit: int = 5000) -> List[Engram]:
+        """Return engrams whose `modified_at` is strictly greater than `iso_ts`, ordered by `modified_at` ascending, at most `limit` rows."""
+        rows = self._conn.execute(
+            "SELECT * FROM engrams WHERE modified_at > ? ORDER BY modified_at ASC LIMIT ?",
+            (iso_ts, limit),
+        ).fetchall()
+        return [self._row_to_engram(r) for r in rows]
+
    def delete(self, engram_id: str) -> bool:
        """Löscht ein Engramm und alle Verknüpfungen."""
        rowid = self._conn.execute(
@@ -239,6 +247,13 @@ class EngramStore:
            "links": json.loads(row["links_json"]),
            "hierarchy": json.loads(row["hierarchy_json"]),
        }
+        # Keep Engram metadata timestamps aligned with DB columns so downstream
+        # consumers (e.g. vector indexing watermarks) can rely on them.
+        try:
+            d["metadata"]["created"] = row["created_at"]
+            d["metadata"]["modified"] = row["modified_at"]
+        except Exception:
+            pass
        emb = row["embedding_json"]
        if emb:
            d["embedding"] = json.loads(emb)