#!/usr/bin/env python3 """ Erweitert Engramme mit predictive linking: sucht nach ähnlichen Inhalten (basierend auf Tag-Überlappung und Keyword-Matching) und speichert Vorschläge. """ from __future__ import annotations import json import re import sqlite3 import sys from collections import Counter from datetime import datetime, timezone from pathlib import Path BRAIN_DIR = Path("/root/.openclaw/workspace/second-brain") DB_PATH = BRAIN_DIR / "data" / "brain.sqlite" def extract_keywords(text: str, max_words: int = 10) -> set[str]: # Einfache Keyword-Extraktion: Wörter > 3 Buchstaben, lowercase words = re.findall(r"\b[a-zA-Z]{4,}\b", text.lower()) # Stopwörter filtern (einfache Liste) stopwords = {"und", "die", "der", "ein", "eine", "auf", "von", "zu", "mit", "für", "ist", "das", "nicht"} return set(w for w in words if w not in stopwords)[:max_words] def run(): conn = sqlite3.connect(str(DB_PATH)) conn.row_factory = sqlite3.Row c = conn.cursor() # Alle Engramme laden (begrenzt für Performance) c.execute("SELECT id, content, metadata_json FROM engrams ORDER BY created_at DESC LIMIT 2000") rows = c.fetchall() engrams = [] for r in rows: meta = json.loads(r["metadata_json"] or "{}") engrams.append({ "id": r["id"], "content": r["content"], "tags": set(meta.get("tags", [])), "keywords": extract_keywords(r["content"]), "source": meta.get("source"), }) updated = 0 for i, eg in enumerate(engrams): # Ähnliche finden durch Tag-Überlappung und Keyword-Jaccard candidates = [] for other in engrams: if other["id"] == eg["id"]: continue # Tag-Overlap tag_overlap = len(eg["tags"] & other["tags"]) # Keyword-Jaccard kw_intersection = len(eg["keywords"] & other["keywords"]) kw_union = len(eg["keywords"] | other["keywords"]) kw_jaccard = kw_intersection / kw_union if kw_union > 0 else 0 score = tag_overlap * 2 + kw_jaccard * 5 if score > 1.0: candidates.append((other["id"], score, list(eg["tags"] & other["tags"]), list(eg["keywords"] & other["keywords"]))) candidates.sort(key=lambda x: x[1], reverse=True) top5 = candidates[:5] if top5: # In metadata speichern meta = json.loads(rows[i]["metadata_json"] or "{}") meta["predictive_links"] = [{"engram_id": cid, "score": round(s, 2), "common_tags": ct, "common_keywords": ck} for cid, s, ct, ck in top5] c.execute("UPDATE engrams SET metadata_json = ?, modified_at = ? WHERE id = ?", (json.dumps(meta), datetime.now(timezone.utc).isoformat(), eg["id"])) updated += 1 conn.commit() conn.close() print(json.dumps({ "success": True, "time": datetime.now(timezone.utc).isoformat(), "engrams_processed": len(engrams), "engrams_updated": updated, }, indent=2, ensure_ascii=False)) if __name__ == "__main__": run()