Second-brain 2.0: hybrid retrieval, obsidian bridge, vector watermark, tests

2026-05-26 19:27:12 +02:00
parent 29bc45d623
commit e1640071e4
7 changed files with 291 additions and 16 deletions
--- a/README.md
+++ b/README.md
@@ -12,3 +12,7 @@ An embeddable, offline-first memory system for AI agents with correctness tracki
 - **Graph Visualization** (`src/graph_view.py`) — Interactive Cytoscape.js graph with confidence colors

 ## Architecture
+
+## Obsidian
+
+Setup and timers: see [docs/OBSIDIAN.md](docs/OBSIDIAN.md) (`second-brain/docs/OBSIDIAN.md` when addressed from the workspace root).
--- a/docs/OBSIDIAN.md
+++ b/docs/OBSIDIAN.md
@@ -0,0 +1,75 @@
+# Obsidian Coupling (Second-Brain 2.0)
+
+This integrates an Obsidian vault with Second-Brain via two cron tasks:
+
+- `cron_tasks/ingest_obsidian.py` (vault → Second-Brain)
+- `cron_tasks/export_obsidian.py` (Second-Brain → vault)
+
+All settings live in `second-brain/data/obsidian_config.json`.
+
+## 1) Install / Sync the vault to the server
+
+You need a local folder on the server that contains an Obsidian vault (it must contain a `.obsidian/` directory), e.g.:
+
+- `/srv/obsidian/MyVault`
+- `/data/obsidian/MyVault`
+- `/root/Obsidian/MyVault`
+
+How you sync it is up to you (Syncthing, rsync, SMB mount, etc.).
+
+## 2) Set `vault_path` in config (auto or manual)
+
+### Auto-discover (only writes if unambiguous)
+
+```bash
+python3 second-brain/scripts/discover_obsidian_vault.py          # dry run: only prints the detected vault
+python3 second-brain/scripts/discover_obsidian_vault.py --write  # additionally writes vault_path into the config
+```
+
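+If multiple vaults are detected, it prints them, refuses to write, and exits with status 2 (status 1 if no vault is found). `--write` requires `second-brain/data/obsidian_config.json` to exist already; the script does not create it.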
+
+### Manual
+
+Edit `second-brain/data/obsidian_config.json` and set:
+
+- `vault_path` to the vault directory (the parent of `.obsidian/`)
+
+## 3) Enable ingest/export
+
+In `second-brain/data/obsidian_config.json`:
+
+- Set `enabled.ingest` to `true` to ingest vault markdown into Second-Brain
+- Set `enabled.export` to `true` to export Second-Brain engrams into the vault
+
+## 4) Enable timers (systemd)
+
+This repo ships unit files in `systemd/`:
+
+- `systemd/openclaw-secondbrain-ingest-obsidian.service`
+- `systemd/openclaw-secondbrain-ingest-obsidian.timer`
+- `systemd/openclaw-secondbrain-export-obsidian.service`
+- `systemd/openclaw-secondbrain-export-obsidian.timer`
+
+Install them (copy or symlink) to `/etc/systemd/system/`, then:
+
+```bash
+sudo systemctl daemon-reload
+sudo systemctl enable --now openclaw-secondbrain-ingest-obsidian.timer
+sudo systemctl enable --now openclaw-secondbrain-export-obsidian.timer
+```
+
+## 5) Verify
+
+Run once manually:
+
+```bash
+python3 openclaw_cron_wrapper.py ingest_obsidian
+python3 openclaw_cron_wrapper.py export_obsidian
+```
+
+What to expect:
+
+- If `vault_path` is missing/invalid, both tasks **skip** safely (no writes to random paths).
+- Ingest creates/updates `second-brain/data/obsidian_ingest_state.json`.
+- Export writes markdown files to `<vault_path>/<export.subdir>/` (default: `SecondBrain/`) and tracks state in `second-brain/data/obsidian_export_state.json`.
+
--- a/scripts/discover_obsidian_vault.py
+++ b/scripts/discover_obsidian_vault.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+"""
+Auto-discover an Obsidian vault on this server and (optionally) write it into:
+  second-brain/data/obsidian_config.json
+
+Safety:
+- Only writes when exactly one vault is detected (unambiguous).
+- A "vault" is a directory that contains a `.obsidian/` folder.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from pathlib import Path
+from typing import Iterable
+
+
+WORKSPACE = Path("/root/.openclaw/workspace")
+BRAIN_DIR = WORKSPACE / "second-brain"
+CONFIG_PATH = BRAIN_DIR / "data" / "obsidian_config.json"
+
+
+def _iter_common_candidates() -> Iterable[Path]:
+    """Yield well-known vault locations; existence is not checked here.
+
+    The `OBSIDIAN_VAULT_PATH` environment variable, when set and non-empty,
+    is yielded first (with `~` expanded), followed by a fixed list of
+    candidates below the home directory, system-wide locations and the
+    workspace.
+    """
+    override = os.environ.get("OBSIDIAN_VAULT_PATH")
+    if override:
+        yield Path(override).expanduser()
+
+    user_home = Path.home()
+    home_relative = (
+        ("Obsidian",),
+        ("ObsidianVault",),
+        ("Vault",),
+        ("Vaults",),
+        ("Documents", "Obsidian"),
+        ("Documents", "Vaults"),
+        ("Syncthing", "Obsidian"),
+    )
+    yield from (user_home.joinpath(*parts) for parts in home_relative)
+
+    # System-wide locations, each in lowercase and capitalised spelling.
+    yield from (Path(base) / leaf for base in ("/srv", "/data") for leaf in ("obsidian", "Obsidian"))
+
+    # Locations inside the workspace.
+    for leaf in ("obsidian", "vault", "vaults"):
+        yield WORKSPACE / leaf
+
+
+def _is_vault_dir(p: Path) -> bool:
+    """Return True when `p` contains a `.obsidian/` directory; any error counts as False."""
+    try:
+        # A directory at `p/.obsidian` implies that `p` itself exists and is a directory.
+        return (p / ".obsidian").is_dir()
+    except Exception:
+        return False
+
+
+def _bounded_find_obsidian_dirs(root: Path, *, max_depth: int) -> list[Path]:
+    """
+    Find `.obsidian` directories below root, limited by depth to keep runtime bounded.
+
+    Args:
+        root: Directory to scan; it is resolved before scanning.
+        max_depth: Deepest directory level (root is level 0) whose entries are
+            inspected. A negative value disables the scan.
+
+    Returns:
+        Paths of the `.obsidian` directories found, in scan order. Empty when
+        root cannot be resolved or is not a directory.
+
+    Entries with one of the skipped names, hidden entries other than
+    `.obsidian`, and symlinked directories are not descended into. Entries
+    that cannot be listed or stat'ed are skipped instead of aborting the scan.
+    """
+    from collections import deque  # local import: O(1) pops from the left end
+
+    skip_names = {".git", "node_modules", "__pycache__", ".cache", ".venv", "venv", "tmp", "proc", "sys", "dev"}
+    results: list[Path] = []
+    try:
+        root = root.resolve()
+        if max_depth < 0 or not root.is_dir():
+            return results
+    except (OSError, RuntimeError):
+        return results
+
+    # Breadth-first scan. The depth travels with each queue entry, so it never
+    # has to be recomputed from the path, and directories beyond max_depth are
+    # never enqueued in the first place.
+    queue = deque([(root, 0)])
+    while queue:
+        current, depth = queue.popleft()
+        try:
+            entries = list(current.iterdir())
+        except OSError:
+            continue
+
+        for e in entries:
+            name = e.name
+            if name in skip_names or (name.startswith(".") and name != ".obsidian"):
+                continue
+            try:
+                # is_dir()/is_symlink() re-raise errors such as EACCES; a single
+                # unreadable entry must not abort the whole scan.
+                if name == ".obsidian":
+                    if e.is_dir():
+                        results.append(e)
+                elif depth < max_depth and e.is_dir() and not e.is_symlink():
+                    queue.append((e, depth + 1))
+            except OSError:
+                continue
+
+    return results
+
+
+def discover(*, roots: list[Path], max_depth: int) -> list[Path]:
+    """Return every detected vault directory as a sorted list of unique, resolved paths.
+
+    Combines the well-known candidate locations that are themselves vaults with
+    the parent directories of all `.obsidian` folders found by a depth-bounded
+    scan of each entry in `roots`.
+    """
+    known = {candidate.resolve() for candidate in _iter_common_candidates() if _is_vault_dir(candidate)}
+    scanned = {
+        marker.parent.resolve()
+        for scan_root in roots
+        for marker in _bounded_find_obsidian_dirs(scan_root, max_depth=max_depth)
+    }
+    return sorted(known | scanned)
+
+
+def main(argv: list[str] | None = None) -> int:
+    """
+    Command-line entry point: detect the vault and optionally store it in the config.
+
+    Args:
+        argv: Argument list to parse; `None` (the default) lets argparse read
+            the process command line.
+
+    Returns:
+        0 when exactly one vault was detected (and written if `--write` was
+        given), 1 when no vault was found, 2 when several vaults were found.
+
+    Raises:
+        SystemExit: With a message when `--write` is given but the config file
+            is missing, is not valid JSON, or does not hold a JSON object.
+    """
+    ap = argparse.ArgumentParser(description="Discover Obsidian vault and optionally write config")
+    ap.add_argument("--write", action="store_true", help="Write detected vault_path into obsidian_config.json")
+    ap.add_argument(
+        "--roots",
+        nargs="*",
+        default=[str(Path.home()), "/srv", "/data", "/mnt", str(WORKSPACE)],
+        help="Roots to scan (bounded). Default: home,/srv,/data,/mnt,workspace",
+    )
+    ap.add_argument("--max-depth", type=int, default=4, help="Max directory depth to scan under each root")
+    args = ap.parse_args(argv)
+
+    roots = [Path(r).expanduser() for r in args.roots]
+    # `type=int` above already guarantees an integer, no extra conversion needed.
+    vaults = discover(roots=roots, max_depth=args.max_depth)
+
+    if not vaults:
+        print("No Obsidian vault found (no `.obsidian/` directories detected).")
+        return 1
+
+    if len(vaults) > 1:
+        print("Multiple Obsidian vaults found; refusing to write config:")
+        for v in vaults:
+            print(f"- {v}")
+        return 2
+
+    vault = vaults[0]
+    print(f"Detected Obsidian vault: {vault}")
+
+    if not args.write:
+        return 0
+
+    if not CONFIG_PATH.exists():
+        raise SystemExit(f"Missing config file: {CONFIG_PATH}")
+
+    # Fail with a readable message instead of a traceback when the existing
+    # config is corrupt or is not a JSON object that can take a key.
+    try:
+        cfg = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise SystemExit(f"Invalid JSON in config file {CONFIG_PATH}: {exc}") from exc
+    if not isinstance(cfg, dict):
+        raise SystemExit(f"Config file must contain a JSON object: {CONFIG_PATH}")
+
+    cfg["vault_path"] = str(vault)
+    CONFIG_PATH.write_text(json.dumps(cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
+    print(f"Wrote vault_path to: {CONFIG_PATH}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
+
--- a/src/openclaw_bridge.py
+++ b/src/openclaw_bridge.py
@@ -38,6 +38,12 @@ try:
 except ImportError:
    Retriever = None

+# Chroma: optional (braucht chromadb)
+try:
+    from src.chroma_store import ChromaStore
+except Exception:
+    ChromaStore = None
+

 # --- Konfiguration ---
 BRAIN_DB = Path(__file__).parent.parent / "data" / "brain.sqlite"
@@ -207,10 +213,34 @@ def enrich_context(topic: str, limit: int = 3) -> str:
    """
    store = get_brain()

-    # Versuche Retriever (mit Embeddings), fallback auf einfache Textsuche
+    # Versuche Hybrid-Retrieval (FTS + optional Vector), fallback auf Textsuche
    if Retriever:
-        ret = Retriever(store)
-        results = ret.retrieve(topic, limit=limit, min_confidence=0.3)
+        chroma = None
+        if ChromaStore:
+            try:
+                chroma = ChromaStore(path=str(Path(__file__).parent.parent / "data" / "chroma"))
+            except Exception:
+                chroma = None
+        ret = Retriever(store, chroma=chroma)
+        try:
+            results = ret.hybrid_retrieve(topic, limit=limit * 3, min_confidence=0.3)
+        except Exception:
+            results = ret.retrieve(topic, limit=limit * 3, min_confidence=0.3)
+
+        # confirmed-first ranking
+        def _rank(r):
+            eg = r["engram"]
+            confirmed = 1 if getattr(eg.correctness, "confirmed", False) else 0
+            return (confirmed, float(r.get("score", 0.0)))
+
+        results.sort(key=_rank, reverse=True)
+
+        # If we have confirmed results, show only confirmed up to limit
+        confirmed_only = [r for r in results if r["engram"].correctness.confirmed]
+        if confirmed_only:
+            results = confirmed_only[:limit]
+        else:
+            results = results[:limit]
    else:
        results_raw = store.search_text(topic, limit=limit)
        results = [{"engram": eg, "score": 0.5} for eg in results_raw]
--- a/src/retriever.py
+++ b/src/retriever.py
@@ -7,12 +7,10 @@ Phase 2: + Embedding + Fusion.
 from typing import List, Dict, Any, Optional
 from .engram import Engram
 from .store import EngramStore
-from .chroma_store import ChromaStore
-from .embedder import encode


 class Retriever:
-    def __init__(self, store: EngramStore, chroma: Optional[ChromaStore] = None):
+    def __init__(self, store: EngramStore, chroma: Optional[object] = None):
        self.store = store
        self.chroma = chroma

@@ -50,7 +48,6 @@ class Retriever:
        if not self.chroma:
            return []
        chroma_results = self.chroma.query(query, top_k=limit * 3)
-        eids = [r["id"] for r in chroma_results]
        results = []
        for r in chroma_results:
            eg = self.store.get(r["id"])
--- a/src/store.py
+++ b/src/store.py
@@ -127,6 +127,14 @@ class EngramStore:
        ).fetchall()
        return [self._row_to_engram(r) for r in rows]

+    def get_modified_since(self, iso_ts: str, limit: int = 5000) -> List[Engram]:
+        """Return engrams whose `modified_at` compares greater than `iso_ts`.
+
+        Rows are ordered by `modified_at` ascending and capped at `limit`.
+        NOTE(review): presumably `modified_at` holds ISO-8601 text so that the
+        comparison is chronological — confirm against the schema.
+        """
+        query = "SELECT * FROM engrams WHERE modified_at > ? ORDER BY modified_at ASC LIMIT ?"
+        cursor = self._conn.execute(query, (iso_ts, limit))
+        return [self._row_to_engram(row) for row in cursor.fetchall()]
+
    def delete(self, engram_id: str) -> bool:
        """Löscht ein Engramm und alle Verknüpfungen."""
        rowid = self._conn.execute(
@@ -239,6 +247,13 @@ class EngramStore:
            "links": json.loads(row["links_json"]),
            "hierarchy": json.loads(row["hierarchy_json"]),
        }
+        # Keep Engram metadata timestamps aligned with DB columns so downstream
+        # consumers (e.g. vector indexing watermarks) can rely on them.
+        try:
+            d["metadata"]["created"] = row["created_at"]
+            d["metadata"]["modified"] = row["modified_at"]
+        except Exception:
+            pass
        emb = row["embedding_json"]
        if emb:
            d["embedding"] = json.loads(emb)
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -4,16 +4,11 @@
 import sys
 import os
 import tempfile
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

-try:
 from src.engram import Engram, Grounding, Correctness
 from src.store import EngramStore
 from src.retriever import Retriever
-except ImportError:
-    from engram import Engram, Grounding, Correctness
-    from store import EngramStore
-    from retriever import Retriever


 def test_engram_creation():