feat: add proactive cron tasks and systemd timers\n\n- 10 proactive tasks: ingest with self-healing & link suggestions, daily summary, health check, archive stale, tag normalizer, predictive links, auto assign review, import context buffer\n- systemd timers for scheduling (02:00/14:00 slots, 30min intervals, weekly)\n- all tasks tested and working\n\nRefs: #1

2026-05-31 13:53:51 +02:00
parent a261f5b9e1
commit 0c72e4d9fa
30 changed files with 1361 additions and 0 deletions
--- a/cron_tasks/predictive_links.py
+++ b/cron_tasks/predictive_links.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+"""
+Erweitert Engramme mit predictive linking: sucht nach ähnlichen Inhalten
+(basierend auf Tag-Überlappung und Keyword-Matching) und speichert Vorschläge.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+import sqlite3
+import sys
+from collections import Counter
+from datetime import datetime, timezone
+from pathlib import Path
+
+# Workspace root of the second-brain data (hard-coded absolute path).
+BRAIN_DIR = Path("/root/.openclaw/workspace/second-brain")
+# SQLite database file that run() opens.
+DB_PATH = BRAIN_DIR.joinpath("data", "brain.sqlite")
+
+# Words of at least four letters. ``[^\W\d_]`` matches any Unicode letter, so
+# words containing umlauts or ß are kept instead of being dropped.
+_KEYWORD_RE = re.compile(r"\b[^\W\d_]{4,}\b")
+
+# Simple German stopword list. Entries shorter than four letters can never
+# match the pattern above; they are kept so the list stays valid if the
+# minimum word length is ever lowered.
+_STOPWORDS = frozenset({"und", "die", "der", "ein", "eine", "auf", "von", "zu", "mit", "für", "ist", "das", "nicht"})
+
+
+def extract_keywords(text: str, max_words: int = 10) -> set[str]:
+    """Return up to *max_words* distinct lowercase keywords found in *text*.
+
+    A keyword is a run of at least four letters that is not a stopword.
+    When the text contains more than *max_words* distinct keywords, the ones
+    that appear first are kept, so the result is deterministic.
+
+    Args:
+        text: The text to scan.
+        max_words: Maximum number of keywords to return; values below zero
+            are treated as zero.
+
+    Returns:
+        A set of lowercase keywords (possibly empty).
+    """
+    words = _KEYWORD_RE.findall(text.lower())
+    # dict.fromkeys de-duplicates while preserving first-occurrence order;
+    # a set cannot be sliced and has no stable order to truncate by.
+    distinct = list(dict.fromkeys(w for w in words if w not in _STOPWORDS))
+    return set(distinct[:max(max_words, 0)])
+
+def run():
+    """Compute predictive link suggestions and store them in engram metadata.
+
+    Loads the 2000 most recently created engrams, scores every pair by tag
+    overlap and keyword Jaccard similarity, and writes the five best matches
+    of each engram into its ``metadata_json`` under ``predictive_links``.
+    A JSON summary of the run is printed to stdout.
+
+    The connection is always closed; if anything raises before the commit,
+    no changes are committed.
+    """
+    conn = sqlite3.connect(str(DB_PATH))
+    try:
+        conn.row_factory = sqlite3.Row
+        c = conn.cursor()
+
+        # Load engrams (limited for performance: the comparison is pairwise).
+        c.execute("SELECT id, content, metadata_json FROM engrams ORDER BY created_at DESC LIMIT 2000")
+
+        engrams = []
+        for r in c.fetchall():
+            meta = json.loads(r["metadata_json"] or "{}")
+            engrams.append({
+                "id": r["id"],
+                # Keep the parsed metadata so it is not parsed a second time
+                # and does not have to be matched back to the row by index.
+                "meta": meta,
+                # ``"tags": null`` in the JSON must not crash set().
+                "tags": set(meta.get("tags") or []),
+                # A NULL content column yields no keywords.
+                "keywords": extract_keywords(r["content"] or ""),
+            })
+
+        updated = 0
+        for eg in engrams:
+            candidates = []
+            for other in engrams:
+                if other["id"] == eg["id"]:
+                    continue
+                common_tags = eg["tags"] & other["tags"]
+                common_keywords = eg["keywords"] & other["keywords"]
+                kw_union = len(eg["keywords"] | other["keywords"])
+                kw_jaccard = len(common_keywords) / kw_union if kw_union > 0 else 0
+                # Each shared tag counts 2; keyword similarity contributes up to 5.
+                score = len(common_tags) * 2 + kw_jaccard * 5
+                if score > 1.0:
+                    # Sort the shared items so the stored JSON is stable
+                    # between runs (set iteration order is not).
+                    candidates.append((
+                        other["id"],
+                        score,
+                        sorted(common_tags, key=str),
+                        sorted(common_keywords),
+                    ))
+            if not candidates:
+                # NOTE(review): existing predictive_links are left untouched
+                # when an engram has no candidates any more - confirm intent.
+                continue
+            candidates.sort(key=lambda x: x[1], reverse=True)
+            top5 = candidates[:5]
+
+            meta = eg["meta"]
+            meta["predictive_links"] = [
+                {"engram_id": cid, "score": round(s, 2), "common_tags": ct, "common_keywords": ck}
+                for cid, s, ct, ck in top5
+            ]
+            c.execute("UPDATE engrams SET metadata_json = ?, modified_at = ? WHERE id = ?",
+                      (json.dumps(meta), datetime.now(timezone.utc).isoformat(), eg["id"]))
+            updated += 1
+
+        conn.commit()
+    finally:
+        # Close even when loading, scoring or writing fails.
+        conn.close()
+
+    print(json.dumps({
+        "success": True,
+        "time": datetime.now(timezone.utc).isoformat(),
+        "engrams_processed": len(engrams),
+        "engrams_updated": updated,
+    }, indent=2, ensure_ascii=False))
+
+# Run the task only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    run()