Files
second-brain/cron_tasks/index_vectors.py

61 lines
1.5 KiB
Python

#!/usr/bin/env python3
"""
Index Engrams into Chroma vector store for semantic search.
"""
from __future__ import annotations
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List
# Root directory of the second-brain workspace; all other paths derive from it.
BRAIN_DIR = Path("/root/.openclaw/workspace/second-brain")
# Put the workspace root first on sys.path so the `src` package imported below
# resolves even when this script is launched from another working directory
# (presumably `src/` lives directly under BRAIN_DIR -- verify on deployment).
sys.path.insert(0, str(BRAIN_DIR))
# These imports must stay below the sys.path tweak above.
from src.store import EngramStore
from src.chroma_store import ChromaStore
# SQLite database file handed to EngramStore.
DB_PATH = BRAIN_DIR / "data" / "brain.sqlite"
# Directory handed to ChromaStore for the vector index.
CHROMA_DIR = BRAIN_DIR / "data" / "chroma"
def run() -> Dict[str, Any]:
    """Index every engram that is missing from the Chroma collection.

    Reads all engram IDs from the SQLite database at ``DB_PATH``, compares
    them with the IDs already present in the Chroma collection opened from
    ``CHROMA_DIR``, and adds each missing engram via ``ChromaStore.add``.
    A failure on one engram is recorded and does not stop the pass.

    Returns:
        A JSON-serialisable report with the keys:

        * ``success`` -- ``True`` only when no per-engram error was recorded.
        * ``time`` -- UTC timestamp taken at the start of the run (ISO 8601).
        * ``indexed`` -- number of engrams added to Chroma in this run.
        * ``skipped`` -- number of engrams whose ID was already in Chroma.
        * ``errors`` -- list of ``"<id>: <message>"`` strings.

    Raises:
        Exception: Anything raised while opening the stores or listing IDs
            propagates to the caller; only the per-engram work is guarded.
    """
    store = EngramStore(str(DB_PATH))
    chroma = ChromaStore(str(CHROMA_DIR))
    out: Dict[str, Any] = {
        "success": True,
        "time": datetime.now(timezone.utc).isoformat(),
        "indexed": 0,
        "skipped": 0,
        "errors": [],
    }
    errors: List[str] = out["errors"]

    # Get all engram IDs from the SQL DB.
    # NOTE(review): this reaches into the store's private connection; a public
    # "list ids" method on EngramStore would be preferable if one exists.
    rows = store._conn.execute("SELECT id FROM engrams").fetchall()
    all_ids = [row[0] for row in rows]

    # Get existing IDs from Chroma. An empty ``include`` list requests no
    # documents, embeddings or metadata -- only the IDs are needed here.
    # NOTE(review): assumes SQLite ids and Chroma ids have the same type
    # (Chroma ids are strings) -- otherwise nothing would ever be skipped.
    existing = set(chroma.collection.get(include=[])["ids"])

    for eg_id in all_ids:
        # Membership test cannot fail, so it stays outside the try block.
        if eg_id in existing:
            out["skipped"] += 1
            continue
        try:
            eg = store.get(eg_id)
            if eg is None:
                errors.append(f"{eg_id}: not found in store")
                continue
            chroma.add(eg)
            out["indexed"] += 1
        except Exception as e:
            # Best-effort pass: record the failure and keep indexing the rest.
            errors.append(f"{eg_id}: {e}")

    # Report failure when any engram could not be indexed; previously the flag
    # was hard-coded to True and could never signal a problem.
    out["success"] = not errors
    return out
if __name__ == "__main__":
    # Script entry point: run one indexing pass and print its report as
    # pretty-printed JSON (non-ASCII characters are emitted unescaped).
    print(json.dumps(run(), ensure_ascii=False, indent=2))