Second-brain 2.0: hybrid retrieval, obsidian bridge, vector watermark, tests

This commit is contained in:
2026-05-26 19:27:12 +02:00
parent 29bc45d623
commit e1640071e4
7 changed files with 291 additions and 16 deletions

View File

@@ -12,3 +12,7 @@ An embeddable, offline-first memory system for AI agents with correctness tracki
- **Graph Visualization** (`src/graph_view.py`) — Interactive Cytoscape.js graph with confidence colors
## Architecture
## Obsidian
Setup and timers: `second-brain/docs/OBSIDIAN.md`

75
docs/OBSIDIAN.md Normal file
View File

@@ -0,0 +1,75 @@
# Obsidian Coupling (Second-Brain 2.0)
This integrates an Obsidian vault with Second-Brain via two cron tasks:
- `cron_tasks/ingest_obsidian.py` (vault → Second-Brain)
- `cron_tasks/export_obsidian.py` (Second-Brain → vault)
All settings live in `second-brain/data/obsidian_config.json`.
## 1) Install / Sync the vault to the server
You need a local folder on the server that contains an Obsidian vault (it must contain a `.obsidian/` directory), e.g.:
- `/srv/obsidian/MyVault`
- `/data/obsidian/MyVault`
- `/root/Obsidian/MyVault`
How you sync it is up to you (Syncthing, rsync, SMB mount, etc.).
## 2) Set `vault_path` in config (auto or manual)
### Auto-discover (only writes if unambiguous)
```bash
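# Dry run: only print the detected vault, change nothing
python3 second-brain/scripts/discover_obsidian_vault.py
# Same detection, then write `vault_path` into obsidian_config.json
python3 second-brain/scripts/discover_obsidian_vault.py --write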
```
If multiple vaults are detected, the script lists them, refuses to write, and exits with status 2. If no vault is found, it exits with status 1.
### Manual
Edit `second-brain/data/obsidian_config.json` and set:
- `vault_path` to the vault directory (the parent of `.obsidian/`)
## 3) Enable ingest/export
In `second-brain/data/obsidian_config.json`:
- Set `enabled.ingest` to `true` to ingest vault markdown into Second-Brain
- Set `enabled.export` to `true` to export Second-Brain engrams into the vault
## 4) Enable timers (systemd)
This repo ships unit files in `systemd/`:
- `systemd/openclaw-secondbrain-ingest-obsidian.service`
- `systemd/openclaw-secondbrain-ingest-obsidian.timer`
- `systemd/openclaw-secondbrain-export-obsidian.service`
- `systemd/openclaw-secondbrain-export-obsidian.timer`
Install them (copy or symlink) to `/etc/systemd/system/`, then:
```bash
sudo systemctl daemon-reload
sudo systemctl enable --now openclaw-secondbrain-ingest-obsidian.timer
sudo systemctl enable --now openclaw-secondbrain-export-obsidian.timer
```
## 5) Verify
Run once manually:
```bash
python3 openclaw_cron_wrapper.py ingest_obsidian
python3 openclaw_cron_wrapper.py export_obsidian
```
What to expect:
- If `vault_path` is missing/invalid, both tasks **skip** safely (no writes to random paths).
- Ingest creates/updates `second-brain/data/obsidian_ingest_state.json`.
- Export writes markdown files to `<vault_path>/<export.subdir>/` (default: `SecondBrain/`) and tracks state in `second-brain/data/obsidian_export_state.json`.

View File

@@ -0,0 +1,159 @@
#!/usr/bin/env python3
"""
Auto-discover an Obsidian vault on this server and (optionally) write it into:
second-brain/data/obsidian_config.json
Safety:
- Only writes when exactly one vault is detected (unambiguous).
- A "vault" is a directory that contains a `.obsidian/` folder.
"""
from __future__ import annotations
import argparse
import json
import os
from pathlib import Path
from typing import Iterable
# Workspace root that this script scans and resolves all other paths against.
WORKSPACE = Path("/root/.openclaw/workspace")
# Second-Brain directory inside the workspace.
BRAIN_DIR = WORKSPACE.joinpath("second-brain")
# JSON config file that receives `vault_path` when the script runs with --write.
CONFIG_PATH = BRAIN_DIR.joinpath("data", "obsidian_config.json")
def _iter_common_candidates() -> Iterable[Path]:
    """Yield conventional locations where an Obsidian vault might live.

    The path from the ``OBSIDIAN_VAULT_PATH`` environment variable (when set
    and non-empty) is yielded first, followed by a fixed list of directories
    under the user's home, ``/srv``, ``/data`` and the workspace. Paths are
    yielded as-is; nothing here checks that they exist.
    """
    from_env = os.environ.get("OBSIDIAN_VAULT_PATH")
    if from_env:
        yield Path(from_env).expanduser()

    home = Path.home()
    yield from [
        home / "Obsidian",
        home / "ObsidianVault",
        home / "Vault",
        home / "Vaults",
        home / "Documents" / "Obsidian",
        home / "Documents" / "Vaults",
        home / "Syncthing" / "Obsidian",
        Path("/srv/obsidian"),
        Path("/srv/Obsidian"),
        Path("/data/obsidian"),
        Path("/data/Obsidian"),
        WORKSPACE / "obsidian",
        WORKSPACE / "vault",
        WORKSPACE / "vaults",
    ]
def _is_vault_dir(p: Path) -> bool:
try:
return p.exists() and p.is_dir() and (p / ".obsidian").exists() and (p / ".obsidian").is_dir()
except Exception:
return False
def _bounded_find_obsidian_dirs(root: Path, *, max_depth: int) -> list[Path]:
"""
Find `.obsidian` directories below root, limited by depth to keep runtime bounded.
"""
results: list[Path] = []
try:
root = root.resolve()
except Exception:
return results
if not root.exists() or not root.is_dir():
return results
def depth_of(path: Path) -> int:
try:
return len(path.relative_to(root).parts)
except Exception:
return 9999
# Breadth-first-ish scan with pruning
queue = [root]
while queue:
current = queue.pop(0)
if depth_of(current) > max_depth:
continue
try:
entries = list(current.iterdir())
except Exception:
continue
for e in entries:
name = e.name
if name in (".git", "node_modules", "__pycache__", ".cache", ".venv", "venv", "tmp", "proc", "sys", "dev"):
continue
if name.startswith(".") and name not in (".obsidian",):
continue
if name == ".obsidian" and e.is_dir():
results.append(e)
continue
if e.is_dir() and not e.is_symlink():
queue.append(e)
return results
def discover(*, roots: list[Path], max_depth: int) -> list[Path]:
    """Collect every detected vault directory, de-duplicated and sorted.

    Two sources are combined: the conventional candidate locations that pass
    the vault check, and the parents of all `.obsidian` directories found by
    the bounded scan of each entry in *roots*. Every path is resolved before
    being added, so the same vault reached two ways is reported once.

    Args:
        roots: Directories to scan for `.obsidian` folders.
        max_depth: Depth limit passed through to the bounded scan.

    Returns:
        Sorted list of resolved vault directories.
    """
    found = {
        candidate.resolve()
        for candidate in _iter_common_candidates()
        if _is_vault_dir(candidate)
    }
    for scan_root in roots:
        found.update(
            marker.parent.resolve()
            for marker in _bounded_find_obsidian_dirs(scan_root, max_depth=max_depth)
        )
    return sorted(found)
def main() -> int:
    """Command-line entry point: detect a vault and optionally persist it.

    Without ``--write`` the detected vault is only printed. With ``--write``
    the existing config file is loaded, its ``vault_path`` key is set to the
    detected vault, and the file is rewritten.

    Returns:
        0 when exactly one vault was detected (and, with ``--write``, the
        config was updated), 1 when no vault was found, 2 when several vaults
        were found and nothing was written.

    Raises:
        SystemExit: With an explanatory message when ``--write`` is given and
            the config file is missing, is not valid JSON, or does not hold a
            JSON object.
    """
    ap = argparse.ArgumentParser(description="Discover Obsidian vault and optionally write config")
    ap.add_argument("--write", action="store_true", help="Write detected vault_path into obsidian_config.json")
    ap.add_argument(
        "--roots",
        nargs="*",
        default=[str(Path.home()), "/srv", "/data", "/mnt", str(WORKSPACE)],
        help="Roots to scan (bounded). Default: home,/srv,/data,/mnt,workspace",
    )
    ap.add_argument("--max-depth", type=int, default=4, help="Max directory depth to scan under each root")
    args = ap.parse_args()

    roots = [Path(r).expanduser() for r in args.roots]
    # argparse already converted --max-depth via type=int.
    vaults = discover(roots=roots, max_depth=args.max_depth)

    if not vaults:
        print("No Obsidian vault found (no `.obsidian/` directories detected).")
        return 1

    if len(vaults) > 1:
        print("Multiple Obsidian vaults found; refusing to write config:")
        for v in vaults:
            print(f"- {v}")
        return 2

    vault = vaults[0]
    print(f"Detected Obsidian vault: {vault}")

    if not args.write:
        return 0

    # Read first and react to the failure instead of checking exists() up
    # front, which would leave a window between the check and the read.
    try:
        raw = CONFIG_PATH.read_text(encoding="utf-8")
    except FileNotFoundError:
        raise SystemExit(f"Missing config file: {CONFIG_PATH}") from None

    try:
        cfg = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise SystemExit(f"Invalid JSON in config file {CONFIG_PATH}: {exc}") from None
    if not isinstance(cfg, dict):
        # Only a JSON object can carry the `vault_path` key.
        raise SystemExit(f"Config file must contain a JSON object: {CONFIG_PATH}")

    cfg["vault_path"] = str(vault)
    CONFIG_PATH.write_text(json.dumps(cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    print(f"Wrote vault_path to: {CONFIG_PATH}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -38,6 +38,12 @@ try:
except ImportError:
Retriever = None
# Chroma: optional (requires chromadb)
try:
from src.chroma_store import ChromaStore
except Exception:
ChromaStore = None
# --- Configuration ---
BRAIN_DB = Path(__file__).parent.parent / "data" / "brain.sqlite"
@@ -207,10 +213,34 @@ def enrich_context(topic: str, limit: int = 3) -> str:
"""
store = get_brain()
# Versuche Retriever (mit Embeddings), fallback auf einfache Textsuche
# Versuche Hybrid-Retrieval (FTS + optional Vector), fallback auf Textsuche
if Retriever:
ret = Retriever(store)
results = ret.retrieve(topic, limit=limit, min_confidence=0.3)
chroma = None
if ChromaStore:
try:
chroma = ChromaStore(path=str(Path(__file__).parent.parent / "data" / "chroma"))
except Exception:
chroma = None
ret = Retriever(store, chroma=chroma)
try:
results = ret.hybrid_retrieve(topic, limit=limit * 3, min_confidence=0.3)
except Exception:
results = ret.retrieve(topic, limit=limit * 3, min_confidence=0.3)
# confirmed-first ranking
def _rank(r):
eg = r["engram"]
confirmed = 1 if getattr(eg.correctness, "confirmed", False) else 0
return (confirmed, float(r.get("score", 0.0)))
results.sort(key=_rank, reverse=True)
# If we have confirmed results, show only confirmed up to limit
confirmed_only = [r for r in results if r["engram"].correctness.confirmed]
if confirmed_only:
results = confirmed_only[:limit]
else:
results = results[:limit]
else:
results_raw = store.search_text(topic, limit=limit)
results = [{"engram": eg, "score": 0.5} for eg in results_raw]

View File

@@ -7,12 +7,10 @@ Phase 2: + Embedding + Fusion.
from typing import List, Dict, Any, Optional
from .engram import Engram
from .store import EngramStore
from .chroma_store import ChromaStore
from .embedder import encode
class Retriever:
def __init__(self, store: EngramStore, chroma: Optional[ChromaStore] = None):
def __init__(self, store: EngramStore, chroma: Optional[object] = None):
self.store = store
self.chroma = chroma
@@ -50,7 +48,6 @@ class Retriever:
if not self.chroma:
return []
chroma_results = self.chroma.query(query, top_k=limit * 3)
eids = [r["id"] for r in chroma_results]
results = []
for r in chroma_results:
eg = self.store.get(r["id"])

View File

@@ -127,6 +127,14 @@ class EngramStore:
).fetchall()
return [self._row_to_engram(r) for r in rows]
def get_modified_since(self, iso_ts: str, limit: int = 5000) -> List[Engram]:
    """Return engrams whose `modified_at` is later than `iso_ts`.

    Rows are ordered by `modified_at` ascending and capped at `limit`; each
    row is converted with `_row_to_engram`.
    """
    cursor = self._conn.execute(
        "SELECT * FROM engrams WHERE modified_at > ? ORDER BY modified_at ASC LIMIT ?",
        (iso_ts, limit),
    )
    return list(map(self._row_to_engram, cursor.fetchall()))
def delete(self, engram_id: str) -> bool:
"""Löscht ein Engramm und alle Verknüpfungen."""
rowid = self._conn.execute(
@@ -239,6 +247,13 @@ class EngramStore:
"links": json.loads(row["links_json"]),
"hierarchy": json.loads(row["hierarchy_json"]),
}
# Keep Engram metadata timestamps aligned with DB columns so downstream
# consumers (e.g. vector indexing watermarks) can rely on them.
try:
d["metadata"]["created"] = row["created_at"]
d["metadata"]["modified"] = row["modified_at"]
except Exception:
pass
emb = row["embedding_json"]
if emb:
d["embedding"] = json.loads(emb)

View File

@@ -4,16 +4,11 @@
import sys
import os
import tempfile
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
try:
from src.engram import Engram, Grounding, Correctness
from src.store import EngramStore
from src.retriever import Retriever
except ImportError:
from engram import Engram, Grounding, Correctness
from store import EngramStore
from retriever import Retriever
from src.engram import Engram, Grounding, Correctness
from src.store import EngramStore
from src.retriever import Retriever
def test_engram_creation():