# second-brain/cron_tasks/index_vectors.py

#!/usr/bin/env python3
"""
Index Engrams into Chroma vector store for semantic search.
"""

from __future__ import annotations

import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List

# Hard-coded absolute root of the second-brain workspace.
BRAIN_DIR = Path("/root/.openclaw/workspace/second-brain")
# Put the workspace root at the front of sys.path before the project imports
# below; presumably the `src` package lives directly under BRAIN_DIR — verify.
sys.path.insert(0, str(BRAIN_DIR))
from src.store import EngramStore
from src.chroma_store import ChromaStore

# Path handed to EngramStore in run().
DB_PATH = BRAIN_DIR / "data" / "brain.sqlite"
# Path handed to ChromaStore in run().
CHROMA_DIR = BRAIN_DIR / "data" / "chroma"


def run() -> Dict[str, Any]:
    """Add every engram that is missing from the Chroma collection.

    Reads all ids from the ``engrams`` table, compares them with the ids
    already present in the Chroma collection, and calls ``chroma.add`` for
    each engram that is not there yet.

    Returns:
        A report dict with the keys:

        * ``success`` -- always ``True`` as written; per-item failures are
          reported through ``errors`` only.
        * ``time`` -- ISO-8601 UTC timestamp taken before indexing starts.
        * ``indexed`` -- number of engrams added to Chroma.
        * ``skipped`` -- number of ids that were already in Chroma.
        * ``errors`` -- one ``"<id>: <message>"`` string per failed id.

    Raises:
        Whatever the store constructors, the id query, or the Chroma id
        lookup raise; only the per-engram work is guarded.
    """
    engram_store = EngramStore(str(DB_PATH))
    vector_store = ChromaStore(str(CHROMA_DIR))

    report: Dict[str, Any] = {
        "success": True,
        "time": datetime.now(timezone.utc).isoformat(),
        "indexed": 0,
        "skipped": 0,
        "errors": [],
    }
    # Alias of the list stored in the report, so appends show up in the result.
    failures: List[str] = report["errors"]

    # Ids known to the SQL store.
    id_rows = engram_store._conn.execute("SELECT id FROM engrams").fetchall()
    engram_ids = [row[0] for row in id_rows]
    # Ids already present in the vector store (include=[] asks for ids only).
    already_indexed = set(vector_store.collection.get(include=[])["ids"])

    for engram_id in engram_ids:
        try:
            if engram_id in already_indexed:
                report["skipped"] += 1
            else:
                engram = engram_store.get(engram_id)
                if engram is not None:
                    vector_store.add(engram)
                    report["indexed"] += 1
                else:
                    failures.append(f"{engram_id}: not found in store")
        except Exception as exc:
            # Best effort: record the failure and carry on with the next id.
            failures.append(f"{engram_id}: {exc}")

    return report


if __name__ == "__main__":
    # Run the indexing pass and emit its report as pretty-printed JSON.
    print(json.dumps(run(), ensure_ascii=False, indent=2))