Second-brain 2.0: hybrid retrieval, obsidian bridge, vector watermark, tests

This commit is contained in:
2026-05-26 19:27:12 +02:00
parent 29bc45d623
commit e1640071e4
7 changed files with 291 additions and 16 deletions

View File

@@ -12,3 +12,7 @@ An embeddable, offline-first memory system for AI agents with correctness tracki
- **Graph Visualization** (`src/graph_view.py`) — Interactive Cytoscape.js graph with confidence colors - **Graph Visualization** (`src/graph_view.py`) — Interactive Cytoscape.js graph with confidence colors
## Architecture ## Architecture
## Obsidian
Setup and timers: `second-brain/docs/OBSIDIAN.md`

75
docs/OBSIDIAN.md Normal file
View File

@@ -0,0 +1,75 @@
# Obsidian Coupling (Second-Brain 2.0)
This integrates an Obsidian vault with Second-Brain via two cron tasks:
- `cron_tasks/ingest_obsidian.py` (vault → Second-Brain)
- `cron_tasks/export_obsidian.py` (Second-Brain → vault)
All settings live in `second-brain/data/obsidian_config.json`.
## 1) Install / Sync the vault to the server
You need a local folder on the server that contains an Obsidian vault (it must contain a `.obsidian/` directory), e.g.:
- `/srv/obsidian/MyVault`
- `/data/obsidian/MyVault`
- `/root/Obsidian/MyVault`
How you sync it is up to you (Syncthing, rsync, SMB mount, etc.).
## 2) Set `vault_path` in config (auto or manual)
### Auto-discover (only writes if unambiguous)
```bash
python3 second-brain/scripts/discover_obsidian_vault.py
python3 second-brain/scripts/discover_obsidian_vault.py --write
```
If multiple vaults are detected, it prints them and refuses to write.
### Manual
Edit `second-brain/data/obsidian_config.json` and set:
- `vault_path` to the vault directory (the parent of `.obsidian/`)
## 3) Enable ingest/export
In `second-brain/data/obsidian_config.json`:
- Set `enabled.ingest` to `true` to ingest vault markdown into Second-Brain
- Set `enabled.export` to `true` to export Second-Brain engrams into the vault
## 4) Enable timers (systemd)
This repo ships unit files in `systemd/`:
- `systemd/openclaw-secondbrain-ingest-obsidian.service`
- `systemd/openclaw-secondbrain-ingest-obsidian.timer`
- `systemd/openclaw-secondbrain-export-obsidian.service`
- `systemd/openclaw-secondbrain-export-obsidian.timer`
Install them (copy or symlink) to `/etc/systemd/system/`, then:
```bash
sudo systemctl daemon-reload
sudo systemctl enable --now openclaw-secondbrain-ingest-obsidian.timer
sudo systemctl enable --now openclaw-secondbrain-export-obsidian.timer
```
## 5) Verify
Run once manually:
```bash
python3 openclaw_cron_wrapper.py ingest_obsidian
python3 openclaw_cron_wrapper.py export_obsidian
```
What to expect:
- If `vault_path` is missing/invalid, both tasks **skip** safely (no writes to random paths).
- Ingest creates/updates `second-brain/data/obsidian_ingest_state.json`.
- Export writes markdown files to `<vault_path>/<export.subdir>/` (default: `SecondBrain/`) and tracks state in `second-brain/data/obsidian_export_state.json`.

View File

@@ -0,0 +1,159 @@
#!/usr/bin/env python3
"""
Auto-discover an Obsidian vault on this server and (optionally) write it into:
second-brain/data/obsidian_config.json
Safety:
- Only writes when exactly one vault is detected (unambiguous).
- A "vault" is a directory that contains a `.obsidian/` folder.
"""
from __future__ import annotations
import argparse
import json
import os
from pathlib import Path
from typing import Iterable
WORKSPACE = Path("/root/.openclaw/workspace")
BRAIN_DIR = WORKSPACE / "second-brain"
CONFIG_PATH = BRAIN_DIR / "data" / "obsidian_config.json"
def _iter_common_candidates() -> Iterable[Path]:
env = os.environ.get("OBSIDIAN_VAULT_PATH")
if env:
yield Path(env).expanduser()
home = Path.home()
for p in [
home / "Obsidian",
home / "ObsidianVault",
home / "Vault",
home / "Vaults",
home / "Documents" / "Obsidian",
home / "Documents" / "Vaults",
home / "Syncthing" / "Obsidian",
Path("/srv/obsidian"),
Path("/srv/Obsidian"),
Path("/data/obsidian"),
Path("/data/Obsidian"),
WORKSPACE / "obsidian",
WORKSPACE / "vault",
WORKSPACE / "vaults",
]:
yield p
def _is_vault_dir(p: Path) -> bool:
try:
return p.exists() and p.is_dir() and (p / ".obsidian").exists() and (p / ".obsidian").is_dir()
except Exception:
return False
def _bounded_find_obsidian_dirs(root: Path, *, max_depth: int) -> list[Path]:
"""
Find `.obsidian` directories below root, limited by depth to keep runtime bounded.
"""
results: list[Path] = []
try:
root = root.resolve()
except Exception:
return results
if not root.exists() or not root.is_dir():
return results
def depth_of(path: Path) -> int:
try:
return len(path.relative_to(root).parts)
except Exception:
return 9999
# Breadth-first-ish scan with pruning
queue = [root]
while queue:
current = queue.pop(0)
if depth_of(current) > max_depth:
continue
try:
entries = list(current.iterdir())
except Exception:
continue
for e in entries:
name = e.name
if name in (".git", "node_modules", "__pycache__", ".cache", ".venv", "venv", "tmp", "proc", "sys", "dev"):
continue
if name.startswith(".") and name not in (".obsidian",):
continue
if name == ".obsidian" and e.is_dir():
results.append(e)
continue
if e.is_dir() and not e.is_symlink():
queue.append(e)
return results
def discover(*, roots: list[Path], max_depth: int) -> list[Path]:
vaults: set[Path] = set()
for p in _iter_common_candidates():
if _is_vault_dir(p):
vaults.add(p.resolve())
for root in roots:
for obsidian_dir in _bounded_find_obsidian_dirs(root, max_depth=max_depth):
vaults.add(obsidian_dir.parent.resolve())
return sorted(vaults)
def main() -> int:
ap = argparse.ArgumentParser(description="Discover Obsidian vault and optionally write config")
ap.add_argument("--write", action="store_true", help="Write detected vault_path into obsidian_config.json")
ap.add_argument(
"--roots",
nargs="*",
default=[str(Path.home()), "/srv", "/data", "/mnt", str(WORKSPACE)],
help="Roots to scan (bounded). Default: home,/srv,/data,/mnt,workspace",
)
ap.add_argument("--max-depth", type=int, default=4, help="Max directory depth to scan under each root")
args = ap.parse_args()
roots = [Path(r).expanduser() for r in args.roots]
vaults = discover(roots=roots, max_depth=int(args.max_depth))
if not vaults:
print("No Obsidian vault found (no `.obsidian/` directories detected).")
return 1
if len(vaults) > 1:
print("Multiple Obsidian vaults found; refusing to write config:")
for v in vaults:
print(f"- {v}")
return 2
vault = vaults[0]
print(f"Detected Obsidian vault: {vault}")
if not args.write:
return 0
if not CONFIG_PATH.exists():
raise SystemExit(f"Missing config file: {CONFIG_PATH}")
cfg = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
cfg["vault_path"] = str(vault)
CONFIG_PATH.write_text(json.dumps(cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
print(f"Wrote vault_path to: {CONFIG_PATH}")
return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -38,6 +38,12 @@ try:
except ImportError: except ImportError:
Retriever = None Retriever = None
# Chroma: optional (braucht chromadb)
try:
from src.chroma_store import ChromaStore
except Exception:
ChromaStore = None
# --- Konfiguration --- # --- Konfiguration ---
BRAIN_DB = Path(__file__).parent.parent / "data" / "brain.sqlite" BRAIN_DB = Path(__file__).parent.parent / "data" / "brain.sqlite"
@@ -207,10 +213,34 @@ def enrich_context(topic: str, limit: int = 3) -> str:
""" """
store = get_brain() store = get_brain()
# Versuche Retriever (mit Embeddings), fallback auf einfache Textsuche # Versuche Hybrid-Retrieval (FTS + optional Vector), fallback auf Textsuche
if Retriever: if Retriever:
ret = Retriever(store) chroma = None
results = ret.retrieve(topic, limit=limit, min_confidence=0.3) if ChromaStore:
try:
chroma = ChromaStore(path=str(Path(__file__).parent.parent / "data" / "chroma"))
except Exception:
chroma = None
ret = Retriever(store, chroma=chroma)
try:
results = ret.hybrid_retrieve(topic, limit=limit * 3, min_confidence=0.3)
except Exception:
results = ret.retrieve(topic, limit=limit * 3, min_confidence=0.3)
# confirmed-first ranking
def _rank(r):
eg = r["engram"]
confirmed = 1 if getattr(eg.correctness, "confirmed", False) else 0
return (confirmed, float(r.get("score", 0.0)))
results.sort(key=_rank, reverse=True)
# If we have confirmed results, show only confirmed up to limit
confirmed_only = [r for r in results if r["engram"].correctness.confirmed]
if confirmed_only:
results = confirmed_only[:limit]
else:
results = results[:limit]
else: else:
results_raw = store.search_text(topic, limit=limit) results_raw = store.search_text(topic, limit=limit)
results = [{"engram": eg, "score": 0.5} for eg in results_raw] results = [{"engram": eg, "score": 0.5} for eg in results_raw]

View File

@@ -7,12 +7,10 @@ Phase 2: + Embedding + Fusion.
from typing import List, Dict, Any, Optional from typing import List, Dict, Any, Optional
from .engram import Engram from .engram import Engram
from .store import EngramStore from .store import EngramStore
from .chroma_store import ChromaStore
from .embedder import encode
class Retriever: class Retriever:
def __init__(self, store: EngramStore, chroma: Optional[ChromaStore] = None): def __init__(self, store: EngramStore, chroma: Optional[object] = None):
self.store = store self.store = store
self.chroma = chroma self.chroma = chroma
@@ -50,7 +48,6 @@ class Retriever:
if not self.chroma: if not self.chroma:
return [] return []
chroma_results = self.chroma.query(query, top_k=limit * 3) chroma_results = self.chroma.query(query, top_k=limit * 3)
eids = [r["id"] for r in chroma_results]
results = [] results = []
for r in chroma_results: for r in chroma_results:
eg = self.store.get(r["id"]) eg = self.store.get(r["id"])

View File

@@ -127,6 +127,14 @@ class EngramStore:
).fetchall() ).fetchall()
return [self._row_to_engram(r) for r in rows] return [self._row_to_engram(r) for r in rows]
def get_modified_since(self, iso_ts: str, limit: int = 5000) -> List[Engram]:
"""Gibt Engramme zurück, deren `modified_at` nach `iso_ts` liegt."""
rows = self._conn.execute(
"SELECT * FROM engrams WHERE modified_at > ? ORDER BY modified_at ASC LIMIT ?",
(iso_ts, limit),
).fetchall()
return [self._row_to_engram(r) for r in rows]
def delete(self, engram_id: str) -> bool: def delete(self, engram_id: str) -> bool:
"""Löscht ein Engramm und alle Verknüpfungen.""" """Löscht ein Engramm und alle Verknüpfungen."""
rowid = self._conn.execute( rowid = self._conn.execute(
@@ -239,6 +247,13 @@ class EngramStore:
"links": json.loads(row["links_json"]), "links": json.loads(row["links_json"]),
"hierarchy": json.loads(row["hierarchy_json"]), "hierarchy": json.loads(row["hierarchy_json"]),
} }
# Keep Engram metadata timestamps aligned with DB columns so downstream
# consumers (e.g. vector indexing watermarks) can rely on them.
try:
d["metadata"]["created"] = row["created_at"]
d["metadata"]["modified"] = row["modified_at"]
except Exception:
pass
emb = row["embedding_json"] emb = row["embedding_json"]
if emb: if emb:
d["embedding"] = json.loads(emb) d["embedding"] = json.loads(emb)

View File

@@ -4,16 +4,11 @@
import sys import sys
import os import os
import tempfile import tempfile
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
try: from src.engram import Engram, Grounding, Correctness
from src.engram import Engram, Grounding, Correctness from src.store import EngramStore
from src.store import EngramStore from src.retriever import Retriever
from src.retriever import Retriever
except ImportError:
from engram import Engram, Grounding, Correctness
from store import EngramStore
from retriever import Retriever
def test_engram_creation(): def test_engram_creation():