Second-brain 2.0: hybrid retrieval, obsidian bridge, vector watermark, tests

2026-05-26 19:27:12 +02:00
parent 29bc45d623
commit e1640071e4
7 changed files with 291 additions and 16 deletions
--- a/README.md
+++ b/README.md
@@ -12,3 +12,7 @@ An embeddable, offline-first memory system for AI agents with correctness tracki
 - **Graph Visualization** (`src/graph_view.py`) — Interactive Cytoscape.js graph with confidence colors
 ## Architecture
 ## Obsidian
 Setup and timers: `second-brain/docs/OBSIDIAN.md`
--- a/docs/OBSIDIAN.md
+++ b/docs/OBSIDIAN.md
@@ -0,0 +1,75 @@
 # Obsidian Coupling (Second-Brain 2.0)
 This integrates an Obsidian vault with Second-Brain via two cron tasks:
 - `cron_tasks/ingest_obsidian.py` (vault → Second-Brain)
 - `cron_tasks/export_obsidian.py` (Second-Brain → vault)
 All settings live in `second-brain/data/obsidian_config.json`.
 ## 1) Install / Sync the vault to the server
 You need a local folder on the server that contains an Obsidian vault (it must contain a `.obsidian/` directory), e.g.:
 - `/srv/obsidian/MyVault`
 - `/data/obsidian/MyVault`
 - `/root/Obsidian/MyVault`
 How you sync it is up to you (Syncthing, rsync, SMB mount, etc.).
 ## 2) Set `vault_path` in config (auto or manual)
 ### Auto-discover (only writes if unambiguous)
 ```bash
 python3 second-brain/scripts/discover_obsidian_vault.py
 python3 second-brain/scripts/discover_obsidian_vault.py --write
 ```
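 Without `--write`, the script only prints the detected vault; with `--write`, it stores the path in `second-brain/data/obsidian_config.json` (the file must already exist). If multiple vaults are detected, it prints them and refuses to write; if none is found, it exits without writing.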
 ### Manual
 Edit `second-brain/data/obsidian_config.json` and set:
 - `vault_path` to the vault directory (the parent of `.obsidian/`)
 ## 3) Enable ingest/export
 In `second-brain/data/obsidian_config.json`:
 - Set `enabled.ingest` to `true` to ingest vault markdown into Second-Brain
 - Set `enabled.export` to `true` to export Second-Brain engrams into the vault
 ## 4) Enable timers (systemd)
 This repo ships unit files in `systemd/`:
 - `systemd/openclaw-secondbrain-ingest-obsidian.service`
 - `systemd/openclaw-secondbrain-ingest-obsidian.timer`
 - `systemd/openclaw-secondbrain-export-obsidian.service`
 - `systemd/openclaw-secondbrain-export-obsidian.timer`
 Install them (copy or symlink) to `/etc/systemd/system/`, then:
 ```bash
 sudo systemctl daemon-reload
 sudo systemctl enable --now openclaw-secondbrain-ingest-obsidian.timer
 sudo systemctl enable --now openclaw-secondbrain-export-obsidian.timer
 ```
 ## 5) Verify
 Run once manually:
 ```bash
 python3 openclaw_cron_wrapper.py ingest_obsidian
 python3 openclaw_cron_wrapper.py export_obsidian
 ```
 What to expect:
 - If `vault_path` is missing/invalid, both tasks **skip** safely (no writes to random paths).
 - Ingest creates/updates `second-brain/data/obsidian_ingest_state.json`.
 - Export writes markdown files to `<vault_path>/<export.subdir>/` (default: `SecondBrain/`) and tracks state in `second-brain/data/obsidian_export_state.json`.
--- a/scripts/discover_obsidian_vault.py
+++ b/scripts/discover_obsidian_vault.py
@@ -0,0 +1,159 @@
 #!/usr/bin/env python3
 """
 Auto-discover an Obsidian vault on this server and (optionally) write it into:
  second-brain/data/obsidian_config.json
 Safety:
 - Only writes when exactly one vault is detected (unambiguous).
 - A "vault" is a directory that contains a `.obsidian/` folder.
 """
 from __future__ import annotations
 import argparse
 import json
 import os
 from pathlib import Path
 from typing import Iterable
 WORKSPACE = Path("/root/.openclaw/workspace")
 BRAIN_DIR = WORKSPACE / "second-brain"
 CONFIG_PATH = BRAIN_DIR / "data" / "obsidian_config.json"
 def _iter_common_candidates() -> Iterable[Path]:
    """Yield directories where an Obsidian vault is conventionally kept.

    The path from the ``OBSIDIAN_VAULT_PATH`` environment variable (when set
    and non-empty) comes first, followed by a fixed list of locations under
    the home directory, ``/srv``, ``/data`` and the workspace. Paths are
    yielded whether or not they exist; callers must validate them.
    """
    override = os.environ.get("OBSIDIAN_VAULT_PATH")
    if override:
        yield Path(override).expanduser()
    user_home = Path.home()
    # Locations relative to the user's home directory, in priority order.
    home_relative = (
        ("Obsidian",),
        ("ObsidianVault",),
        ("Vault",),
        ("Vaults",),
        ("Documents", "Obsidian"),
        ("Documents", "Vaults"),
        ("Syncthing", "Obsidian"),
    )
    yield from (user_home.joinpath(*parts) for parts in home_relative)
    # System-wide data directories (both spellings are probed).
    yield from (Path(raw) for raw in ("/srv/obsidian", "/srv/Obsidian", "/data/obsidian", "/data/Obsidian"))
    # Folders inside the workspace itself.
    yield from (WORKSPACE / leaf for leaf in ("obsidian", "vault", "vaults"))
 def _is_vault_dir(p: Path) -> bool:
    """Return True when *p* is a directory that contains a ``.obsidian`` directory.

    Any exception raised while probing the filesystem is treated as
    "not a vault" and yields False.
    """
    try:
        if not (p.exists() and p.is_dir()):
            return False
        marker = p / ".obsidian"
        return marker.exists() and marker.is_dir()
    except Exception:
        return False
 def _bounded_find_obsidian_dirs(root: Path, *, max_depth: int) -> list[Path]:
    """Find ``.obsidian`` directories below *root* with a depth-bounded scan.

    The tree is walked breadth-first. Well-known bulky or virtual directory
    names and hidden directories (other than ``.obsidian``) are pruned, and
    symlinked directories are never descended into.

    Args:
        root: Directory to start from; it is resolved before scanning.
        max_depth: Deepest directory level (relative to *root*, which is
            level 0) whose entries are still listed.

    Returns:
        The ``.obsidian`` directories found, shallowest first. Empty when
        *root* cannot be resolved, is not a directory, or *max_depth* is
        negative.
    """
    from collections import deque

    pruned_names = frozenset(
        (".git", "node_modules", "__pycache__", ".cache", ".venv", "venv", "tmp", "proc", "sys", "dev")
    )
    found: list[Path] = []
    try:
        root = root.resolve()
        root_is_dir = root.is_dir()
    except Exception:
        return found
    if not root_is_dir or max_depth < 0:
        return found
    # Each queue item carries its own depth so it never has to be recomputed.
    pending = deque([(root, 0)])
    while pending:
        current, depth = pending.popleft()
        try:
            entries = list(current.iterdir())
        except OSError:
            continue
        for entry in entries:
            name = entry.name
            if name in pruned_names:
                continue
            if name.startswith(".") and name != ".obsidian":
                continue
            try:
                if not entry.is_dir():
                    continue
                if name == ".obsidian":
                    found.append(entry)
                elif depth < max_depth and not entry.is_symlink():
                    pending.append((entry, depth + 1))
            except OSError:
                # A single unreadable entry must not abort the whole scan.
                continue
    return found
 def discover(*, roots: list[Path], max_depth: int) -> list[Path]:
    """Return the sorted, de-duplicated list of detected vault directories.

    Vaults come from two sources: the conventional candidate locations that
    pass the vault check, and the parents of every ``.obsidian`` directory
    found by the bounded scan of each entry in *roots*. All paths are
    resolved before de-duplication.
    """
    from_candidates = {
        candidate.resolve()
        for candidate in _iter_common_candidates()
        if _is_vault_dir(candidate)
    }
    from_scan = {
        marker.parent.resolve()
        for scan_root in roots
        for marker in _bounded_find_obsidian_dirs(scan_root, max_depth=max_depth)
    }
    return sorted(from_candidates | from_scan)
 def main() -> int:
    """Command-line entry point: discover a vault and optionally persist it.

    Returns:
        0 when exactly one vault was found (and, with ``--write``, stored in
        the config file), 1 when no vault was found, 2 when several vaults
        were found and nothing was written.

    Raises:
        SystemExit: with an error message when ``--write`` is given but the
            config file is missing, unreadable, not valid JSON, or does not
            hold a JSON object.
    """
    ap = argparse.ArgumentParser(description="Discover Obsidian vault and optionally write config")
    ap.add_argument("--write", action="store_true", help="Write detected vault_path into obsidian_config.json")
    ap.add_argument(
        "--roots",
        nargs="*",
        default=[str(Path.home()), "/srv", "/data", "/mnt", str(WORKSPACE)],
        help="Roots to scan (bounded). Default: home,/srv,/data,/mnt,workspace",
    )
    ap.add_argument("--max-depth", type=int, default=4, help="Max directory depth to scan under each root")
    args = ap.parse_args()
    roots = [Path(r).expanduser() for r in args.roots]
    vaults = discover(roots=roots, max_depth=args.max_depth)
    if not vaults:
        print("No Obsidian vault found (no `.obsidian/` directories detected).")
        return 1
    if len(vaults) > 1:
        # Ambiguous result: list everything but never guess which one to store.
        print("Multiple Obsidian vaults found; refusing to write config:")
        for v in vaults:
            print(f"- {v}")
        return 2
    vault = vaults[0]
    print(f"Detected Obsidian vault: {vault}")
    if not args.write:
        return 0
    if not CONFIG_PATH.exists():
        raise SystemExit(f"Missing config file: {CONFIG_PATH}")
    try:
        cfg = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError) as err:
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        raise SystemExit(f"Cannot read config file {CONFIG_PATH}: {err}") from err
    if not isinstance(cfg, dict):
        raise SystemExit(f"Config file must contain a JSON object: {CONFIG_PATH}")
    cfg["vault_path"] = str(vault)
    CONFIG_PATH.write_text(json.dumps(cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    print(f"Wrote vault_path to: {CONFIG_PATH}")
    return 0
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/src/openclaw_bridge.py
+++ b/src/openclaw_bridge.py
@@ -38,6 +38,12 @@ try:
 except ImportError:
    Retriever = None
 # Chroma: optional (requires chromadb)
 try:
    from src.chroma_store import ChromaStore
 except Exception:
    ChromaStore = None
 # --- Konfiguration ---
 BRAIN_DB = Path(__file__).parent.parent / "data" / "brain.sqlite"
@@ -207,10 +213,34 @@ def enrich_context(topic: str, limit: int = 3) -> str:
    """
    store = get_brain()
-    # Versuche Retriever (mit Embeddings), fallback auf einfache Textsuche
+    # Versuche Hybrid-Retrieval (FTS + optional Vector), fallback auf Textsuche
    if Retriever:
-        ret = Retriever(store)
+        chroma = None
-        results = ret.retrieve(topic, limit=limit, min_confidence=0.3)
+        if ChromaStore:
            try:
                chroma = ChromaStore(path=str(Path(__file__).parent.parent / "data" / "chroma"))
            except Exception:
                chroma = None
        ret = Retriever(store, chroma=chroma)
        try:
            results = ret.hybrid_retrieve(topic, limit=limit * 3, min_confidence=0.3)
        except Exception:
            results = ret.retrieve(topic, limit=limit * 3, min_confidence=0.3)
        # confirmed-first ranking
        def _rank(r):
            eg = r["engram"]
            confirmed = 1 if getattr(eg.correctness, "confirmed", False) else 0
            return (confirmed, float(r.get("score", 0.0)))
        results.sort(key=_rank, reverse=True)
        # If we have confirmed results, show only confirmed up to limit
        confirmed_only = [r for r in results if r["engram"].correctness.confirmed]
        if confirmed_only:
            results = confirmed_only[:limit]
        else:
            results = results[:limit]
    else:
        results_raw = store.search_text(topic, limit=limit)
        results = [{"engram": eg, "score": 0.5} for eg in results_raw]
--- a/src/retriever.py
+++ b/src/retriever.py
@@ -7,12 +7,10 @@ Phase 2: + Embedding + Fusion.
 from typing import List, Dict, Any, Optional
 from .engram import Engram
 from .store import EngramStore
 from .chroma_store import ChromaStore
 from .embedder import encode
 class Retriever:
-    def __init__(self, store: EngramStore, chroma: Optional[ChromaStore] = None):
+    def __init__(self, store: EngramStore, chroma: Optional[object] = None):
        self.store = store
        self.chroma = chroma
@@ -50,7 +48,6 @@ class Retriever:
        if not self.chroma:
            return []
        chroma_results = self.chroma.query(query, top_k=limit * 3)
        eids = [r["id"] for r in chroma_results]
        results = []
        for r in chroma_results:
            eg = self.store.get(r["id"])
--- a/src/store.py
+++ b/src/store.py
@@ -127,6 +127,14 @@ class EngramStore:
        ).fetchall()
        return [self._row_to_engram(r) for r in rows]
    def get_modified_since(self, iso_ts: str, limit: int = 5000) -> List[Engram]:
        """Return engrams whose `modified_at` is later than `iso_ts`.

        Rows are ordered by `modified_at` ascending and capped at `limit`.
        The comparison is performed by the database on the stored column
        value. NOTE(review): presumably timestamps share one ISO format so
        that ordering is chronological; confirm against the writers.
        """
        query = "SELECT * FROM engrams WHERE modified_at > ? ORDER BY modified_at ASC LIMIT ?"
        cursor = self._conn.execute(query, (iso_ts, limit))
        return [self._row_to_engram(record) for record in cursor.fetchall()]
    def delete(self, engram_id: str) -> bool:
        """Löscht ein Engramm und alle Verknüpfungen."""
        rowid = self._conn.execute(
@@ -239,6 +247,13 @@ class EngramStore:
            "links": json.loads(row["links_json"]),
            "hierarchy": json.loads(row["hierarchy_json"]),
        }
        # Keep Engram metadata timestamps aligned with DB columns so downstream
        # consumers (e.g. vector indexing watermarks) can rely on them.
        try:
            d["metadata"]["created"] = row["created_at"]
            d["metadata"]["modified"] = row["modified_at"]
        except Exception:
            pass
        emb = row["embedding_json"]
        if emb:
            d["embedding"] = json.loads(emb)
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -4,16 +4,11 @@
 import sys
 import os
 import tempfile
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 try:
 from src.engram import Engram, Grounding, Correctness
 from src.store import EngramStore
 from src.retriever import Retriever
 except ImportError:
    from engram import Engram, Grounding, Correctness
    from store import EngramStore
    from retriever import Retriever
 def test_engram_creation():