Second-brain 2.0: hybrid retrieval, obsidian bridge, vector watermark, tests
This commit is contained in:
@@ -12,3 +12,7 @@ An embeddable, offline-first memory system for AI agents with correctness tracki
|
|||||||
- **Graph Visualization** (`src/graph_view.py`) — Interactive Cytoscape.js graph with confidence colors
|
- **Graph Visualization** (`src/graph_view.py`) — Interactive Cytoscape.js graph with confidence colors
|
||||||
|
|
||||||
## Architecture
|
## Architecture
|
||||||
|
|
||||||
|
## Obsidian
|
||||||
|
|
||||||
|
Setup and timers: `second-brain/docs/OBSIDIAN.md`
|
||||||
|
|||||||
75
docs/OBSIDIAN.md
Normal file
75
docs/OBSIDIAN.md
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
# Obsidian Coupling (Second-Brain 2.0)
|
||||||
|
|
||||||
|
This integrates an Obsidian vault with Second-Brain via two cron tasks:
|
||||||
|
|
||||||
|
- `cron_tasks/ingest_obsidian.py` (vault → Second-Brain)
|
||||||
|
- `cron_tasks/export_obsidian.py` (Second-Brain → vault)
|
||||||
|
|
||||||
|
All settings live in `second-brain/data/obsidian_config.json`.
|
||||||
|
|
||||||
|
## 1) Install / Sync the vault to the server
|
||||||
|
|
||||||
|
You need a local folder on the server that contains an Obsidian vault (it must contain a `.obsidian/` directory), e.g.:
|
||||||
|
|
||||||
|
- `/srv/obsidian/MyVault`
|
||||||
|
- `/data/obsidian/MyVault`
|
||||||
|
- `/root/Obsidian/MyVault`
|
||||||
|
|
||||||
|
How you sync it is up to you (Syncthing, rsync, SMB mount, etc.).
|
||||||
|
|
||||||
|
## 2) Set `vault_path` in config (auto or manual)
|
||||||
|
|
||||||
|
### Auto-discover (only writes if unambiguous)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 second-brain/scripts/discover_obsidian_vault.py
|
||||||
|
python3 second-brain/scripts/discover_obsidian_vault.py --write
|
||||||
|
```
|
||||||
|
|
||||||
|
If multiple vaults are detected, it prints them and refuses to write.
|
||||||
|
|
||||||
|
### Manual
|
||||||
|
|
||||||
|
Edit `second-brain/data/obsidian_config.json` and set:
|
||||||
|
|
||||||
|
- `vault_path` to the vault directory (the parent of `.obsidian/`)
|
||||||
|
|
||||||
|
## 3) Enable ingest/export
|
||||||
|
|
||||||
|
In `second-brain/data/obsidian_config.json`:
|
||||||
|
|
||||||
|
- Set `enabled.ingest` to `true` to ingest vault markdown into Second-Brain
|
||||||
|
- Set `enabled.export` to `true` to export Second-Brain engrams into the vault
|
||||||
|
|
||||||
|
## 4) Enable timers (systemd)
|
||||||
|
|
||||||
|
This repo ships unit files in `systemd/`:
|
||||||
|
|
||||||
|
- `systemd/openclaw-secondbrain-ingest-obsidian.service`
|
||||||
|
- `systemd/openclaw-secondbrain-ingest-obsidian.timer`
|
||||||
|
- `systemd/openclaw-secondbrain-export-obsidian.service`
|
||||||
|
- `systemd/openclaw-secondbrain-export-obsidian.timer`
|
||||||
|
|
||||||
|
Install them (copy or symlink) to `/etc/systemd/system/`, then:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl enable --now openclaw-secondbrain-ingest-obsidian.timer
|
||||||
|
sudo systemctl enable --now openclaw-secondbrain-export-obsidian.timer
|
||||||
|
```
|
||||||
|
|
||||||
|
## 5) Verify
|
||||||
|
|
||||||
|
Run once manually:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 openclaw_cron_wrapper.py ingest_obsidian
|
||||||
|
python3 openclaw_cron_wrapper.py export_obsidian
|
||||||
|
```
|
||||||
|
|
||||||
|
What to expect:
|
||||||
|
|
||||||
|
- If `vault_path` is missing/invalid, both tasks **skip** safely (no writes to random paths).
|
||||||
|
- Ingest creates/updates `second-brain/data/obsidian_ingest_state.json`.
|
||||||
|
- Export writes markdown files to `<vault_path>/<export.subdir>/` (default: `SecondBrain/`) and tracks state in `second-brain/data/obsidian_export_state.json`.
|
||||||
|
|
||||||
159
scripts/discover_obsidian_vault.py
Normal file
159
scripts/discover_obsidian_vault.py
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Auto-discover an Obsidian vault on this server and (optionally) write it into:
|
||||||
|
second-brain/data/obsidian_config.json
|
||||||
|
|
||||||
|
Safety:
|
||||||
|
- Only writes when exactly one vault is detected (unambiguous).
|
||||||
|
- A "vault" is a directory that contains a `.obsidian/` folder.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
|
||||||
|
# Root of the OpenClaw workspace on the server.
WORKSPACE = Path("/root/.openclaw/workspace")
# Second-Brain checkout inside the workspace.
BRAIN_DIR = WORKSPACE.joinpath("second-brain")
# JSON config file that receives the discovered `vault_path`.
CONFIG_PATH = BRAIN_DIR.joinpath("data", "obsidian_config.json")
|
||||||
|
|
||||||
|
|
||||||
|
def _iter_common_candidates() -> Iterable[Path]:
    """Yield conventional locations where an Obsidian vault might live.

    The path from the ``OBSIDIAN_VAULT_PATH`` environment variable comes first
    (when set and non-empty), followed by a fixed list of directories under the
    home directory, ``/srv``, ``/data`` and the workspace. Candidates are
    yielded as-is; nothing here checks that they exist.
    """
    override = os.environ.get("OBSIDIAN_VAULT_PATH")
    if override:
        yield Path(override).expanduser()

    home_dir = Path.home()
    under_home = (
        ("Obsidian",),
        ("ObsidianVault",),
        ("Vault",),
        ("Vaults",),
        ("Documents", "Obsidian"),
        ("Documents", "Vaults"),
        ("Syncthing", "Obsidian"),
    )
    system_wide = ("/srv/obsidian", "/srv/Obsidian", "/data/obsidian", "/data/Obsidian")
    under_workspace = ("obsidian", "vault", "vaults")

    yield from (home_dir.joinpath(*parts) for parts in under_home)
    yield from (Path(location) for location in system_wide)
    yield from (WORKSPACE / leaf for leaf in under_workspace)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_vault_dir(p: Path) -> bool:
|
||||||
|
try:
|
||||||
|
return p.exists() and p.is_dir() and (p / ".obsidian").exists() and (p / ".obsidian").is_dir()
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _bounded_find_obsidian_dirs(root: Path, *, max_depth: int) -> list[Path]:
|
||||||
|
"""
|
||||||
|
Find `.obsidian` directories below root, limited by depth to keep runtime bounded.
|
||||||
|
"""
|
||||||
|
results: list[Path] = []
|
||||||
|
try:
|
||||||
|
root = root.resolve()
|
||||||
|
except Exception:
|
||||||
|
return results
|
||||||
|
|
||||||
|
if not root.exists() or not root.is_dir():
|
||||||
|
return results
|
||||||
|
|
||||||
|
def depth_of(path: Path) -> int:
|
||||||
|
try:
|
||||||
|
return len(path.relative_to(root).parts)
|
||||||
|
except Exception:
|
||||||
|
return 9999
|
||||||
|
|
||||||
|
# Breadth-first-ish scan with pruning
|
||||||
|
queue = [root]
|
||||||
|
while queue:
|
||||||
|
current = queue.pop(0)
|
||||||
|
if depth_of(current) > max_depth:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
entries = list(current.iterdir())
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
for e in entries:
|
||||||
|
name = e.name
|
||||||
|
if name in (".git", "node_modules", "__pycache__", ".cache", ".venv", "venv", "tmp", "proc", "sys", "dev"):
|
||||||
|
continue
|
||||||
|
if name.startswith(".") and name not in (".obsidian",):
|
||||||
|
continue
|
||||||
|
if name == ".obsidian" and e.is_dir():
|
||||||
|
results.append(e)
|
||||||
|
continue
|
||||||
|
if e.is_dir() and not e.is_symlink():
|
||||||
|
queue.append(e)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def discover(*, roots: list[Path], max_depth: int) -> list[Path]:
    """Collect every detected vault directory as a sorted list of resolved paths.

    Two sources are combined and de-duplicated: the common candidate locations
    that turn out to be vaults, and the parents of all `.obsidian` directories
    found by the bounded scan below each entry of *roots*.
    """
    found: set[Path] = {
        candidate.resolve()
        for candidate in _iter_common_candidates()
        if _is_vault_dir(candidate)
    }

    for scan_root in roots:
        found.update(
            marker.parent.resolve()
            for marker in _bounded_find_obsidian_dirs(scan_root, max_depth=max_depth)
        )

    return sorted(found)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Command-line entry point: detect a vault and optionally store it in the config.

    Returns:
        0 when exactly one vault was detected (and written, if ``--write`` was
        given), 1 when no vault was found, 2 when several vaults were found and
        the choice is therefore ambiguous.

    Raises:
        SystemExit: With an explanatory message when ``--write`` is requested
            but the config file is missing, is not valid JSON, or does not
            contain a JSON object.
    """
    parser = argparse.ArgumentParser(description="Discover Obsidian vault and optionally write config")
    parser.add_argument("--write", action="store_true", help="Write detected vault_path into obsidian_config.json")
    parser.add_argument(
        "--roots",
        nargs="*",
        default=[str(Path.home()), "/srv", "/data", "/mnt", str(WORKSPACE)],
        help="Roots to scan (bounded). Default: home,/srv,/data,/mnt,workspace",
    )
    parser.add_argument("--max-depth", type=int, default=4, help="Max directory depth to scan under each root")
    args = parser.parse_args()

    scan_roots = [Path(raw).expanduser() for raw in args.roots]
    vaults = discover(roots=scan_roots, max_depth=args.max_depth)

    if not vaults:
        print("No Obsidian vault found (no `.obsidian/` directories detected).")
        return 1

    if len(vaults) > 1:
        # Ambiguous: never guess which vault the user meant.
        print("Multiple Obsidian vaults found; refusing to write config:")
        for candidate in vaults:
            print(f"- {candidate}")
        return 2

    vault = vaults[0]
    print(f"Detected Obsidian vault: {vault}")

    if not args.write:
        return 0

    if not CONFIG_PATH.exists():
        raise SystemExit(f"Missing config file: {CONFIG_PATH}")

    # Fail with a readable message instead of a traceback when the existing
    # config cannot be updated in place.
    try:
        cfg = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        raise SystemExit(f"Invalid JSON in config file {CONFIG_PATH}: {exc}") from exc
    if not isinstance(cfg, dict):
        raise SystemExit(f"Config file must contain a JSON object: {CONFIG_PATH}")

    cfg["vault_path"] = str(vault)
    CONFIG_PATH.write_text(json.dumps(cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    print(f"Wrote vault_path to: {CONFIG_PATH}")
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
raise SystemExit(main())
|
||||||
|
|
||||||
@@ -38,6 +38,12 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
Retriever = None
|
Retriever = None
|
||||||
|
|
||||||
|
# Chroma: optional (braucht chromadb)
|
||||||
|
try:
|
||||||
|
from src.chroma_store import ChromaStore
|
||||||
|
except Exception:
|
||||||
|
ChromaStore = None
|
||||||
|
|
||||||
|
|
||||||
# --- Konfiguration ---
|
# --- Konfiguration ---
|
||||||
BRAIN_DB = Path(__file__).parent.parent / "data" / "brain.sqlite"
|
BRAIN_DB = Path(__file__).parent.parent / "data" / "brain.sqlite"
|
||||||
@@ -207,10 +213,34 @@ def enrich_context(topic: str, limit: int = 3) -> str:
|
|||||||
"""
|
"""
|
||||||
store = get_brain()
|
store = get_brain()
|
||||||
|
|
||||||
# Versuche Retriever (mit Embeddings), fallback auf einfache Textsuche
|
# Versuche Hybrid-Retrieval (FTS + optional Vector), fallback auf Textsuche
|
||||||
if Retriever:
|
if Retriever:
|
||||||
ret = Retriever(store)
|
chroma = None
|
||||||
results = ret.retrieve(topic, limit=limit, min_confidence=0.3)
|
if ChromaStore:
|
||||||
|
try:
|
||||||
|
chroma = ChromaStore(path=str(Path(__file__).parent.parent / "data" / "chroma"))
|
||||||
|
except Exception:
|
||||||
|
chroma = None
|
||||||
|
ret = Retriever(store, chroma=chroma)
|
||||||
|
try:
|
||||||
|
results = ret.hybrid_retrieve(topic, limit=limit * 3, min_confidence=0.3)
|
||||||
|
except Exception:
|
||||||
|
results = ret.retrieve(topic, limit=limit * 3, min_confidence=0.3)
|
||||||
|
|
||||||
|
# confirmed-first ranking
|
||||||
|
def _rank(r):
|
||||||
|
eg = r["engram"]
|
||||||
|
confirmed = 1 if getattr(eg.correctness, "confirmed", False) else 0
|
||||||
|
return (confirmed, float(r.get("score", 0.0)))
|
||||||
|
|
||||||
|
results.sort(key=_rank, reverse=True)
|
||||||
|
|
||||||
|
# If we have confirmed results, show only confirmed up to limit
|
||||||
|
confirmed_only = [r for r in results if r["engram"].correctness.confirmed]
|
||||||
|
if confirmed_only:
|
||||||
|
results = confirmed_only[:limit]
|
||||||
|
else:
|
||||||
|
results = results[:limit]
|
||||||
else:
|
else:
|
||||||
results_raw = store.search_text(topic, limit=limit)
|
results_raw = store.search_text(topic, limit=limit)
|
||||||
results = [{"engram": eg, "score": 0.5} for eg in results_raw]
|
results = [{"engram": eg, "score": 0.5} for eg in results_raw]
|
||||||
|
|||||||
@@ -7,12 +7,10 @@ Phase 2: + Embedding + Fusion.
|
|||||||
from typing import List, Dict, Any, Optional
|
from typing import List, Dict, Any, Optional
|
||||||
from .engram import Engram
|
from .engram import Engram
|
||||||
from .store import EngramStore
|
from .store import EngramStore
|
||||||
from .chroma_store import ChromaStore
|
|
||||||
from .embedder import encode
|
|
||||||
|
|
||||||
|
|
||||||
class Retriever:
|
class Retriever:
|
||||||
def __init__(self, store: EngramStore, chroma: Optional[ChromaStore] = None):
|
def __init__(self, store: EngramStore, chroma: Optional[object] = None):
|
||||||
self.store = store
|
self.store = store
|
||||||
self.chroma = chroma
|
self.chroma = chroma
|
||||||
|
|
||||||
@@ -50,7 +48,6 @@ class Retriever:
|
|||||||
if not self.chroma:
|
if not self.chroma:
|
||||||
return []
|
return []
|
||||||
chroma_results = self.chroma.query(query, top_k=limit * 3)
|
chroma_results = self.chroma.query(query, top_k=limit * 3)
|
||||||
eids = [r["id"] for r in chroma_results]
|
|
||||||
results = []
|
results = []
|
||||||
for r in chroma_results:
|
for r in chroma_results:
|
||||||
eg = self.store.get(r["id"])
|
eg = self.store.get(r["id"])
|
||||||
|
|||||||
15
src/store.py
15
src/store.py
@@ -127,6 +127,14 @@ class EngramStore:
|
|||||||
).fetchall()
|
).fetchall()
|
||||||
return [self._row_to_engram(r) for r in rows]
|
return [self._row_to_engram(r) for r in rows]
|
||||||
|
|
||||||
|
def get_modified_since(self, iso_ts: str, limit: int = 5000) -> List[Engram]:
    """Return engrams whose `modified_at` is later than `iso_ts`.

    Rows are ordered by `modified_at` ascending and capped at `limit`.
    """
    query = "SELECT * FROM engrams WHERE modified_at > ? ORDER BY modified_at ASC LIMIT ?"
    cursor = self._conn.execute(query, (iso_ts, limit))
    return [self._row_to_engram(record) for record in cursor.fetchall()]
|
||||||
|
|
||||||
def delete(self, engram_id: str) -> bool:
|
def delete(self, engram_id: str) -> bool:
|
||||||
"""Löscht ein Engramm und alle Verknüpfungen."""
|
"""Löscht ein Engramm und alle Verknüpfungen."""
|
||||||
rowid = self._conn.execute(
|
rowid = self._conn.execute(
|
||||||
@@ -239,6 +247,13 @@ class EngramStore:
|
|||||||
"links": json.loads(row["links_json"]),
|
"links": json.loads(row["links_json"]),
|
||||||
"hierarchy": json.loads(row["hierarchy_json"]),
|
"hierarchy": json.loads(row["hierarchy_json"]),
|
||||||
}
|
}
|
||||||
|
# Keep Engram metadata timestamps aligned with DB columns so downstream
|
||||||
|
# consumers (e.g. vector indexing watermarks) can rely on them.
|
||||||
|
try:
|
||||||
|
d["metadata"]["created"] = row["created_at"]
|
||||||
|
d["metadata"]["modified"] = row["modified_at"]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
emb = row["embedding_json"]
|
emb = row["embedding_json"]
|
||||||
if emb:
|
if emb:
|
||||||
d["embedding"] = json.loads(emb)
|
d["embedding"] = json.loads(emb)
|
||||||
|
|||||||
@@ -4,16 +4,11 @@
|
|||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||||
|
|
||||||
try:
|
|
||||||
from src.engram import Engram, Grounding, Correctness
|
from src.engram import Engram, Grounding, Correctness
|
||||||
from src.store import EngramStore
|
from src.store import EngramStore
|
||||||
from src.retriever import Retriever
|
from src.retriever import Retriever
|
||||||
except ImportError:
|
|
||||||
from engram import Engram, Grounding, Correctness
|
|
||||||
from store import EngramStore
|
|
||||||
from retriever import Retriever
|
|
||||||
|
|
||||||
|
|
||||||
def test_engram_creation():
|
def test_engram_creation():
|
||||||
|
|||||||
Reference in New Issue
Block a user