Second-brain 2.0: hybrid retrieval, obsidian bridge, vector watermark, tests
This commit is contained in:
@@ -12,3 +12,7 @@ An embeddable, offline-first memory system for AI agents with correctness tracki
|
||||
- **Graph Visualization** (`src/graph_view.py`) — Interactive Cytoscape.js graph with confidence colors
|
||||
|
||||
## Architecture
|
||||
|
||||
## Obsidian
|
||||
|
||||
Setup and timers: `second-brain/docs/OBSIDIAN.md`
|
||||
|
||||
75
docs/OBSIDIAN.md
Normal file
75
docs/OBSIDIAN.md
Normal file
@@ -0,0 +1,75 @@
|
||||
# Obsidian Coupling (Second-Brain 2.0)
|
||||
|
||||
This integrates an Obsidian vault with Second-Brain via two cron tasks:
|
||||
|
||||
- `cron_tasks/ingest_obsidian.py` (vault → Second-Brain)
|
||||
- `cron_tasks/export_obsidian.py` (Second-Brain → vault)
|
||||
|
||||
All settings live in `second-brain/data/obsidian_config.json`.
|
||||
|
||||
## 1) Install / Sync the vault to the server
|
||||
|
||||
You need a local folder on the server that contains an Obsidian vault (it must contain a `.obsidian/` directory), e.g.:
|
||||
|
||||
- `/srv/obsidian/MyVault`
|
||||
- `/data/obsidian/MyVault`
|
||||
- `/root/Obsidian/MyVault`
|
||||
|
||||
How you sync it is up to you (Syncthing, rsync, SMB mount, etc.).
|
||||
|
||||
## 2) Set `vault_path` in config (auto or manual)
|
||||
|
||||
### Auto-discover (only writes if unambiguous)
|
||||
|
||||
```bash
# Dry run: print the detected vault without changing anything
python3 second-brain/scripts/discover_obsidian_vault.py
# Write the detected vault_path into obsidian_config.json
python3 second-brain/scripts/discover_obsidian_vault.py --write
```
|
||||
|
||||
If multiple vaults are detected, it prints them and refuses to write.
|
||||
|
||||
### Manual
|
||||
|
||||
Edit `second-brain/data/obsidian_config.json` and set:
|
||||
|
||||
- `vault_path` to the vault directory (the parent of `.obsidian/`)
|
||||
|
||||
## 3) Enable ingest/export
|
||||
|
||||
In `second-brain/data/obsidian_config.json`:
|
||||
|
||||
- Set `enabled.ingest` to `true` to ingest vault markdown into Second-Brain
|
||||
- Set `enabled.export` to `true` to export Second-Brain engrams into the vault
|
||||
|
||||
## 4) Enable timers (systemd)
|
||||
|
||||
This repo ships unit files in `systemd/`:
|
||||
|
||||
- `systemd/openclaw-secondbrain-ingest-obsidian.service`
|
||||
- `systemd/openclaw-secondbrain-ingest-obsidian.timer`
|
||||
- `systemd/openclaw-secondbrain-export-obsidian.service`
|
||||
- `systemd/openclaw-secondbrain-export-obsidian.timer`
|
||||
|
||||
Install them (copy or symlink) to `/etc/systemd/system/`, then:
|
||||
|
||||
```bash
sudo systemctl daemon-reload
sudo systemctl enable --now openclaw-secondbrain-ingest-obsidian.timer
sudo systemctl enable --now openclaw-secondbrain-export-obsidian.timer
```
|
||||
|
||||
## 5) Verify
|
||||
|
||||
Run once manually:
|
||||
|
||||
```bash
python3 openclaw_cron_wrapper.py ingest_obsidian
python3 openclaw_cron_wrapper.py export_obsidian
```
|
||||
|
||||
What to expect:
|
||||
|
||||
- If `vault_path` is missing/invalid, both tasks **skip** safely (no writes to random paths).
|
||||
- Ingest creates/updates `second-brain/data/obsidian_ingest_state.json`.
|
||||
- Export writes markdown files to `<vault_path>/<export.subdir>/` (default: `SecondBrain/`) and tracks state in `second-brain/data/obsidian_export_state.json`.
|
||||
|
||||
159
scripts/discover_obsidian_vault.py
Normal file
159
scripts/discover_obsidian_vault.py
Normal file
@@ -0,0 +1,159 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Auto-discover an Obsidian vault on this server and (optionally) write it into:
|
||||
second-brain/data/obsidian_config.json
|
||||
|
||||
Safety:
|
||||
- Only writes when exactly one vault is detected (unambiguous).
|
||||
- A "vault" is a directory that contains a `.obsidian/` folder.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
|
||||
# Filesystem layout: every path below is derived from the fixed workspace root.
WORKSPACE = Path("/root/.openclaw/workspace")
BRAIN_DIR = WORKSPACE.joinpath("second-brain")
# JSON config file that receives `vault_path` when the script runs with --write.
CONFIG_PATH = BRAIN_DIR.joinpath("data", "obsidian_config.json")
|
||||
|
||||
|
||||
def _iter_common_candidates() -> Iterable[Path]:
    """Yield well-known locations where an Obsidian vault might live.

    The path from the ``OBSIDIAN_VAULT_PATH`` environment variable comes first
    (only when it is set and non-empty), followed by a fixed list of guesses
    under the home directory, ``/srv``, ``/data`` and the workspace. Paths are
    yielded as-is; nothing here checks that they exist.
    """
    configured = os.environ.get("OBSIDIAN_VAULT_PATH")
    if configured:
        yield Path(configured).expanduser()

    home = Path.home()
    home_relative = (
        ("Obsidian",),
        ("ObsidianVault",),
        ("Vault",),
        ("Vaults",),
        ("Documents", "Obsidian"),
        ("Documents", "Vaults"),
        ("Syncthing", "Obsidian"),
    )
    system_wide = ("/srv/obsidian", "/srv/Obsidian", "/data/obsidian", "/data/Obsidian")
    workspace_relative = ("obsidian", "vault", "vaults")

    yield from (home.joinpath(*parts) for parts in home_relative)
    yield from (Path(location) for location in system_wide)
    yield from (WORKSPACE / name for name in workspace_relative)
|
||||
|
||||
|
||||
def _is_vault_dir(p: Path) -> bool:
    """Return True when *p* is a directory that directly contains `.obsidian/`.

    Any exception raised while probing the filesystem is treated as
    "not a vault" and yields False.
    """
    try:
        # is_dir() is already False for paths that do not exist.
        if not p.is_dir():
            return False
        marker = p / ".obsidian"
        return marker.is_dir()
    except Exception:
        return False
|
||||
|
||||
|
||||
def _bounded_find_obsidian_dirs(root: Path, *, max_depth: int) -> list[Path]:
    """
    Find `.obsidian` directories below root, limited by depth to keep runtime bounded.

    The scan is breadth-first. ``root`` itself has depth 0; the contents of a
    directory are listed only while that directory's depth is ``<= max_depth``,
    so a vault directory may sit at most ``max_depth`` levels below ``root``.

    Pruning rules:
    - well-known noise directories (``.git``, ``node_modules``, ``tmp``, ...)
      are skipped by name at any level;
    - hidden entries (leading dot) other than ``.obsidian`` are skipped;
    - symlinked directories are not descended into;
    - entries or directories that cannot be inspected are skipped instead of
      aborting the whole scan.

    Args:
        root: Directory to start from. It is resolved before scanning.
        max_depth: Maximum depth of directories whose contents are listed.
            A negative value yields no results.

    Returns:
        The `.obsidian` directories found (not their parent vault directories),
        in discovery order. Empty when ``root`` is missing, not a directory,
        or cannot be resolved.
    """
    from collections import deque

    skip_names = frozenset(
        (".git", "node_modules", "__pycache__", ".cache", ".venv", "venv", "tmp", "proc", "sys", "dev")
    )
    results: list[Path] = []

    try:
        root = root.resolve()
        if not root.is_dir():
            return results
    except (OSError, RuntimeError):
        return results

    if max_depth < 0:
        return results

    # Depth travels with each queue item, so it never has to be recomputed
    # from the path; deque gives O(1) pops from the left end.
    queue = deque([(root, 0)])
    while queue:
        current, depth = queue.popleft()
        try:
            entries = list(current.iterdir())
        except OSError:
            continue

        for entry in entries:
            name = entry.name
            if name in skip_names:
                continue
            if name.startswith(".") and name != ".obsidian":
                continue

            # stat() on a single entry can fail (e.g. permission denied);
            # that must not abort the scan of everything else.
            try:
                if not entry.is_dir():
                    continue
                if name == ".obsidian":
                    results.append(entry)
                    continue
                descend = depth < max_depth and not entry.is_symlink()
            except OSError:
                continue

            if descend:
                queue.append((entry, depth + 1))

    return results
|
||||
|
||||
|
||||
def discover(*, roots: list[Path], max_depth: int) -> list[Path]:
    """Collect every Obsidian vault directory that can be found.

    Two sources are merged: the well-known candidate locations that turn out
    to be vaults, and the parents of all `.obsidian` directories found by a
    depth-bounded scan of each entry in ``roots``.

    Args:
        roots: Directories to scan.
        max_depth: Depth limit handed to the bounded scan.

    Returns:
        Resolved vault directories, de-duplicated and sorted.
    """
    from_candidates = {
        candidate.resolve()
        for candidate in _iter_common_candidates()
        if _is_vault_dir(candidate)
    }

    from_scan = set()
    for scan_root in roots:
        markers = _bounded_find_obsidian_dirs(scan_root, max_depth=max_depth)
        # The vault is the directory that contains the `.obsidian` folder.
        from_scan.update(marker.parent.resolve() for marker in markers)

    return sorted(from_candidates | from_scan)
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: discover a vault and optionally record it.

    Without ``--write`` the detected vault is only printed. With ``--write``
    its path is stored under the ``vault_path`` key of the existing config
    file; all other keys in that file are kept.

    Returns:
        0 when exactly one vault was found (and written, if requested),
        1 when no vault was found,
        2 when several vaults were found (nothing is written).

    Raises:
        SystemExit: with a message when ``--write`` is given and the config
            file is missing, unreadable, not valid JSON, or not a JSON object.
    """
    parser = argparse.ArgumentParser(description="Discover Obsidian vault and optionally write config")
    parser.add_argument("--write", action="store_true", help="Write detected vault_path into obsidian_config.json")
    parser.add_argument(
        "--roots",
        nargs="*",
        default=[str(Path.home()), "/srv", "/data", "/mnt", str(WORKSPACE)],
        help="Roots to scan (bounded). Default: home,/srv,/data,/mnt,workspace",
    )
    parser.add_argument("--max-depth", type=int, default=4, help="Max directory depth to scan under each root")
    args = parser.parse_args()

    roots = [Path(raw).expanduser() for raw in args.roots]
    # argparse already converted --max-depth to int (type=int).
    vaults = discover(roots=roots, max_depth=args.max_depth)

    if not vaults:
        print("No Obsidian vault found (no `.obsidian/` directories detected).")
        return 1

    if len(vaults) > 1:
        print("Multiple Obsidian vaults found; refusing to write config:")
        for found in vaults:
            print(f"- {found}")
        return 2

    vault = vaults[0]
    print(f"Detected Obsidian vault: {vault}")

    if not args.write:
        return 0

    if not CONFIG_PATH.exists():
        raise SystemExit(f"Missing config file: {CONFIG_PATH}")

    # Fail with a readable message instead of a traceback when the config is
    # unreadable or malformed. JSONDecodeError and UnicodeDecodeError are both
    # ValueError subclasses.
    try:
        cfg = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        raise SystemExit(f"Cannot read config file {CONFIG_PATH}: {exc}") from exc

    # Only a JSON object can take a `vault_path` key.
    if not isinstance(cfg, dict):
        raise SystemExit(f"Config file must contain a JSON object: {CONFIG_PATH}")

    cfg["vault_path"] = str(vault)
    CONFIG_PATH.write_text(json.dumps(cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    print(f"Wrote vault_path to: {CONFIG_PATH}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Hand main()'s return code to the interpreter as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
||||
|
||||
@@ -38,6 +38,12 @@ try:
|
||||
except ImportError:
|
||||
Retriever = None
|
||||
|
||||
# Chroma: optional (braucht chromadb)
|
||||
try:
|
||||
from src.chroma_store import ChromaStore
|
||||
except Exception:
|
||||
ChromaStore = None
|
||||
|
||||
|
||||
# --- Konfiguration ---
|
||||
BRAIN_DB = Path(__file__).parent.parent / "data" / "brain.sqlite"
|
||||
@@ -207,10 +213,34 @@ def enrich_context(topic: str, limit: int = 3) -> str:
|
||||
"""
|
||||
store = get_brain()
|
||||
|
||||
# Versuche Retriever (mit Embeddings), fallback auf einfache Textsuche
|
||||
# Versuche Hybrid-Retrieval (FTS + optional Vector), fallback auf Textsuche
|
||||
if Retriever:
|
||||
ret = Retriever(store)
|
||||
results = ret.retrieve(topic, limit=limit, min_confidence=0.3)
|
||||
chroma = None
|
||||
if ChromaStore:
|
||||
try:
|
||||
chroma = ChromaStore(path=str(Path(__file__).parent.parent / "data" / "chroma"))
|
||||
except Exception:
|
||||
chroma = None
|
||||
ret = Retriever(store, chroma=chroma)
|
||||
try:
|
||||
results = ret.hybrid_retrieve(topic, limit=limit * 3, min_confidence=0.3)
|
||||
except Exception:
|
||||
results = ret.retrieve(topic, limit=limit * 3, min_confidence=0.3)
|
||||
|
||||
# confirmed-first ranking
|
||||
def _rank(r):
|
||||
eg = r["engram"]
|
||||
confirmed = 1 if getattr(eg.correctness, "confirmed", False) else 0
|
||||
return (confirmed, float(r.get("score", 0.0)))
|
||||
|
||||
results.sort(key=_rank, reverse=True)
|
||||
|
||||
# If we have confirmed results, show only confirmed up to limit
|
||||
confirmed_only = [r for r in results if r["engram"].correctness.confirmed]
|
||||
if confirmed_only:
|
||||
results = confirmed_only[:limit]
|
||||
else:
|
||||
results = results[:limit]
|
||||
else:
|
||||
results_raw = store.search_text(topic, limit=limit)
|
||||
results = [{"engram": eg, "score": 0.5} for eg in results_raw]
|
||||
|
||||
@@ -7,12 +7,10 @@ Phase 2: + Embedding + Fusion.
|
||||
from typing import List, Dict, Any, Optional
|
||||
from .engram import Engram
|
||||
from .store import EngramStore
|
||||
from .chroma_store import ChromaStore
|
||||
from .embedder import encode
|
||||
|
||||
|
||||
class Retriever:
|
||||
def __init__(self, store: EngramStore, chroma: Optional[ChromaStore] = None):
|
||||
def __init__(self, store: EngramStore, chroma: Optional[object] = None):
|
||||
self.store = store
|
||||
self.chroma = chroma
|
||||
|
||||
@@ -50,7 +48,6 @@ class Retriever:
|
||||
if not self.chroma:
|
||||
return []
|
||||
chroma_results = self.chroma.query(query, top_k=limit * 3)
|
||||
eids = [r["id"] for r in chroma_results]
|
||||
results = []
|
||||
for r in chroma_results:
|
||||
eg = self.store.get(r["id"])
|
||||
|
||||
15
src/store.py
15
src/store.py
@@ -127,6 +127,14 @@ class EngramStore:
|
||||
).fetchall()
|
||||
return [self._row_to_engram(r) for r in rows]
|
||||
|
||||
def get_modified_since(self, iso_ts: str, limit: int = 5000) -> List[Engram]:
    """Return engrams whose `modified_at` is later than `iso_ts`.

    Rows come back ordered by `modified_at` ascending and capped at `limit`.
    """
    # NOTE(review): the comparison is done by SQLite on the stored column
    # values; presumably these are ISO-8601 strings that sort
    # chronologically - confirm against the writer.
    query = "SELECT * FROM engrams WHERE modified_at > ? ORDER BY modified_at ASC LIMIT ?"
    cursor = self._conn.execute(query, (iso_ts, limit))
    engrams = []
    for record in cursor.fetchall():
        engrams.append(self._row_to_engram(record))
    return engrams
|
||||
|
||||
def delete(self, engram_id: str) -> bool:
|
||||
"""Löscht ein Engramm und alle Verknüpfungen."""
|
||||
rowid = self._conn.execute(
|
||||
@@ -239,6 +247,13 @@ class EngramStore:
|
||||
"links": json.loads(row["links_json"]),
|
||||
"hierarchy": json.loads(row["hierarchy_json"]),
|
||||
}
|
||||
# Keep Engram metadata timestamps aligned with DB columns so downstream
|
||||
# consumers (e.g. vector indexing watermarks) can rely on them.
|
||||
try:
|
||||
d["metadata"]["created"] = row["created_at"]
|
||||
d["metadata"]["modified"] = row["modified_at"]
|
||||
except Exception:
|
||||
pass
|
||||
emb = row["embedding_json"]
|
||||
if emb:
|
||||
d["embedding"] = json.loads(emb)
|
||||
|
||||
@@ -4,16 +4,11 @@
|
||||
import sys
|
||||
import os
|
||||
import tempfile
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
try:
|
||||
from src.engram import Engram, Grounding, Correctness
|
||||
from src.store import EngramStore
|
||||
from src.retriever import Retriever
|
||||
except ImportError:
|
||||
from engram import Engram, Grounding, Correctness
|
||||
from store import EngramStore
|
||||
from retriever import Retriever
|
||||
|
||||
|
||||
def test_engram_creation():
|
||||
|
||||
Reference in New Issue
Block a user