# second-brain/cron_tasks/predictive_links.py

#!/usr/bin/env python3
"""
Erweitert Engramme mit predictive linking: sucht nach ähnlichen Inhalten
(basierend auf Tag-Überlappung und Keyword-Matching) und speichert Vorschläge.
"""

from __future__ import annotations

import json
import re
import sqlite3
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path

# Base directory (absolute path) under which the database file lives.
BRAIN_DIR = Path("/root/.openclaw/workspace/second-brain")
# SQLite database file that run() opens to read and update the engrams table.
DB_PATH = BRAIN_DIR / "data" / "brain.sqlite"

def extract_keywords(text: str, max_words: int = 10) -> set[str]:
    """Return up to *max_words* lowercase keywords found in *text*.

    A keyword is a run of at least four letters (any Unicode letter, so
    words with umlauts or ß are kept whole) that is not in the stopword
    list. When the text holds more distinct keywords than *max_words*, the
    most frequent ones are kept; ties are resolved by first appearance, so
    the result is deterministic for a given input.

    Args:
        text: Text to scan. Empty or ``None`` yields an empty set.
        max_words: Maximum number of keywords to return. Values <= 0 yield
            an empty set.

    Returns:
        A set of at most *max_words* lowercase keywords.
    """
    if not text or max_words <= 0:
        return set()

    # [^\W\d_] is "word character that is neither digit nor underscore",
    # i.e. a letter in any script; plain [a-zA-Z] would drop every word
    # that contains an umlaut.
    words = re.findall(r"\b[^\W\d_]{4,}\b", text.lower())
    # Simple stopword list (entries shorter than four letters can never
    # match the pattern above, but are kept for completeness).
    stopwords = {"und", "die", "der", "ein", "eine", "auf", "von", "zu", "mit", "für", "ist", "das", "nicht"}
    counts = Counter(w for w in words if w not in stopwords)
    # A set cannot be sliced, so the size limit is applied on the ranked
    # list returned by most_common() before building the result set.
    return {word for word, _ in counts.most_common(max_words)}

def _parse_metadata(raw):
    """Decode a metadata_json value; return None unless it is a JSON object."""
    if not raw:
        # NULL / empty column is treated as "no metadata yet".
        return {}
    try:
        meta = json.loads(raw)
    except (TypeError, ValueError):
        return None
    return meta if isinstance(meta, dict) else None


def _string_tags(meta):
    """Return the string entries of meta["tags"] as a set (empty if absent or not a list)."""
    tags = meta.get("tags")
    if not isinstance(tags, (list, tuple)):
        return set()
    return {tag for tag in tags if isinstance(tag, str)}


def _rank_links(engram, engrams, limit=5):
    """Score *engram* against all other engrams and return the top *limit* link dicts."""
    scored = []
    for other in engrams:
        if other["id"] == engram["id"]:
            continue
        shared_tags = engram["tags"] & other["tags"]
        shared_keywords = engram["keywords"] & other["keywords"]
        union_size = len(engram["keywords"] | other["keywords"])
        kw_jaccard = len(shared_keywords) / union_size if union_size else 0
        # Each shared tag counts 2 points; keyword Jaccard contributes up to 5.
        score = len(shared_tags) * 2 + kw_jaccard * 5
        if score > 1.0:
            scored.append((score, other["id"], shared_tags, shared_keywords))
    # Stable sort: candidates with equal score keep the engram load order.
    scored.sort(key=lambda item: item[0], reverse=True)
    return [
        {
            "engram_id": other_id,
            "score": round(score, 2),
            # Sorted so the stored JSON does not depend on set iteration
            # order, which differs between interpreter runs.
            "common_tags": sorted(shared_tags),
            "common_keywords": sorted(shared_keywords),
        }
        for score, other_id, shared_tags, shared_keywords in scored[:limit]
    ]


def run():
    """Compute predictive links for the newest engrams and store them.

    Loads up to 2000 engrams (newest first), scores every pair by tag
    overlap and keyword Jaccard similarity, and writes the five best
    matches of each engram into its metadata under "predictive_links".
    Rows are rewritten (and modified_at bumped) only when the links
    actually changed. Rows whose metadata_json is not a JSON object are
    skipped so their stored value is never overwritten.

    Prints a JSON summary to stdout. The database connection is closed
    even when an error occurs; uncommitted changes are discarded then.
    """
    conn = sqlite3.connect(str(DB_PATH))
    try:
        conn.row_factory = sqlite3.Row
        cur = conn.cursor()

        # Load all engrams (limited for performance).
        cur.execute("SELECT id, content, metadata_json FROM engrams ORDER BY created_at DESC LIMIT 2000")

        engrams = []
        skipped = 0
        for row in cur.fetchall():
            meta = _parse_metadata(row["metadata_json"])
            if meta is None:
                skipped += 1
                continue
            engrams.append({
                "id": row["id"],
                "tags": _string_tags(meta),
                # content may be NULL in the database
                "keywords": extract_keywords(row["content"] or ""),
                "meta": meta,
            })

        updated = 0
        for engram in engrams:
            links = _rank_links(engram, engrams)
            meta = engram["meta"]
            # Skip no-op writes so modified_at only changes on real updates.
            if not links or meta.get("predictive_links") == links:
                continue
            meta["predictive_links"] = links
            cur.execute("UPDATE engrams SET metadata_json = ?, modified_at = ? WHERE id = ?",
                        (json.dumps(meta), datetime.now(timezone.utc).isoformat(), engram["id"]))
            updated += 1

        conn.commit()
    finally:
        conn.close()

    print(json.dumps({
        "success": True,
        "time": datetime.now(timezone.utc).isoformat(),
        "engrams_processed": len(engrams),
        "engrams_updated": updated,
        "engrams_skipped": skipped,
    }, indent=2, ensure_ascii=False))

# Run the linking pass only when executed as a script, not on import.
if __name__ == "__main__":
    run()