feat: add proactive cron tasks and systemd timers

- 10 proactive tasks: ingest with self-healing & link suggestions, daily summary, health check, archive stale, tag normalizer, predictive links, auto assign review, import context buffer
- systemd timers for scheduling (02:00/14:00 slots, 30min intervals, weekly)
- all tasks tested and working

Refs: #1
This commit is contained in:
84
cron_tasks/predictive_links.py
Normal file
84
cron_tasks/predictive_links.py
Normal file
@@ -0,0 +1,84 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Erweitert Engramme mit predictive linking: sucht nach ähnlichen Inhalten
|
||||
(basierend auf Tag-Überlappung und Keyword-Matching) und speichert Vorschläge.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
import sys
|
||||
from collections import Counter
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# Root directory of the second-brain workspace.
BRAIN_DIR = Path("/root/.openclaw/workspace/second-brain")
# SQLite database file holding the engrams table.
DB_PATH = BRAIN_DIR.joinpath("data", "brain.sqlite")
|
||||
|
||||
def extract_keywords(text: str, max_words: int = 10) -> set[str]:
    """Return up to ``max_words`` keywords extracted from ``text``.

    A keyword is a lowercased run of at least four letters that is not in
    the small built-in stopword list. Unicode letters count as letters, so
    German words containing umlauts or ``ß`` are kept intact.

    When more than ``max_words`` distinct keywords are found, the most
    frequent ones are kept; ties are broken by first appearance in the
    text, which keeps the result deterministic.

    Args:
        text: Free text to scan. An empty (or missing) value yields an
            empty set.
        max_words: Maximum number of keywords to return. Values <= 0
            yield an empty set.

    Returns:
        A set containing at most ``max_words`` keywords.
    """
    if not text or max_words <= 0:
        return set()

    # ``[^\W\d_]`` matches any Unicode letter (word characters minus digits
    # and underscore); a plain ``[a-zA-Z]`` would drop every word that
    # contains an umlaut.
    words = re.findall(r"\b[^\W\d_]{4,}\b", text.lower())

    # Simple stopword list. Entries shorter than four letters can never be
    # produced by the pattern above; they are kept for completeness.
    stopwords = {"und", "die", "der", "ein", "eine", "auf", "von", "zu", "mit", "für", "ist", "das", "nicht"}

    # A set cannot be sliced, so rank by frequency and keep the top entries.
    counts = Counter(word for word in words if word not in stopwords)
    return {word for word, _ in counts.most_common(max_words)}
|
||||
|
||||
def run():
    """Compute predictive link suggestions and store them in engram metadata.

    Loads the 2000 most recently created engrams and scores every pair:
    each shared tag contributes 2 points and the Jaccard similarity of the
    two keyword sets contributes up to 5 points. For every engram, the five
    best-scoring partners with a score above 1.0 are written to the
    ``predictive_links`` key of its ``metadata_json`` and ``modified_at`` is
    refreshed. Engrams without any qualifying partner are left untouched.

    All updates are committed together at the end; the database connection
    is closed even if an error interrupts the run. A JSON summary is
    printed to stdout on success.
    """
    conn = sqlite3.connect(str(DB_PATH))
    try:
        conn.row_factory = sqlite3.Row
        cur = conn.cursor()

        # Load the engrams (limited for performance: scoring is pairwise).
        cur.execute("SELECT id, content, metadata_json FROM engrams ORDER BY created_at DESC LIMIT 2000")

        engrams = []
        for row in cur.fetchall():
            meta = json.loads(row["metadata_json"] or "{}")
            engrams.append({
                "id": row["id"],
                # Keep the parsed metadata so it is not parsed a second time
                # when the suggestions are written back.
                "meta": meta,
                # ``or []`` also covers an explicit ``"tags": null``.
                "tags": set(meta.get("tags") or []),
                # Guard against NULL content in the database.
                "keywords": extract_keywords(row["content"] or ""),
            })

        updated = 0
        for eg in engrams:
            # Find similar engrams via tag overlap and keyword Jaccard.
            candidates = []
            for other in engrams:
                if other["id"] == eg["id"]:
                    continue
                common_tags = eg["tags"] & other["tags"]
                common_keywords = eg["keywords"] & other["keywords"]
                # |A ∪ B| = |A| + |B| - |A ∩ B|; avoids building the union.
                kw_union = len(eg["keywords"]) + len(other["keywords"]) - len(common_keywords)
                kw_jaccard = len(common_keywords) / kw_union if kw_union > 0 else 0
                score = len(common_tags) * 2 + kw_jaccard * 5
                if score > 1.0:
                    candidates.append((other["id"], score, common_tags, common_keywords))

            if not candidates:
                continue

            candidates.sort(key=lambda cand: cand[1], reverse=True)

            # Store the suggestions in the metadata. The common tags and
            # keywords are sorted so the stored JSON does not depend on set
            # iteration order.
            meta = eg["meta"]
            meta["predictive_links"] = [
                {
                    "engram_id": cid,
                    "score": round(s, 2),
                    "common_tags": sorted(ct, key=str),
                    "common_keywords": sorted(ck),
                }
                for cid, s, ct, ck in candidates[:5]
            ]
            cur.execute(
                "UPDATE engrams SET metadata_json = ?, modified_at = ? WHERE id = ?",
                (json.dumps(meta), datetime.now(timezone.utc).isoformat(), eg["id"]),
            )
            updated += 1

        conn.commit()
    finally:
        # Release the database handle on both the success and the error path.
        conn.close()

    print(json.dumps({
        "success": True,
        "time": datetime.now(timezone.utc).isoformat(),
        "engrams_processed": len(engrams),
        "engrams_updated": updated,
    }, indent=2, ensure_ascii=False))
|
||||
|
||||
# Execute the job only when this file is run directly, not when imported.
if __name__ == "__main__":
    run()
|
||||
Reference in New Issue
Block a user