Files
second-brain/cron_tasks/predictive_links.py

85 lines
3.1 KiB
Python

#!/usr/bin/env python3
"""
Erweitert Engramme mit predictive linking: sucht nach ähnlichen Inhalten
(basierend auf Tag-Überlappung und Keyword-Matching) und speichert Vorschläge.
"""
from __future__ import annotations
import json
import re
import sqlite3
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
BRAIN_DIR = Path("/root/.openclaw/workspace/second-brain")
DB_PATH = BRAIN_DIR / "data" / "brain.sqlite"
def extract_keywords(text: str, max_words: int = 10) -> set[str]:
# Einfache Keyword-Extraktion: Wörter > 3 Buchstaben, lowercase
words = re.findall(r"\b[a-zA-Z]{4,}\b", text.lower())
# Stopwörter filtern (einfache Liste)
stopwords = {"und", "die", "der", "ein", "eine", "auf", "von", "zu", "mit", "für", "ist", "das", "nicht"}
return set(w for w in words if w not in stopwords)[:max_words]
def run():
conn = sqlite3.connect(str(DB_PATH))
conn.row_factory = sqlite3.Row
c = conn.cursor()
# Alle Engramme laden (begrenzt für Performance)
c.execute("SELECT id, content, metadata_json FROM engrams ORDER BY created_at DESC LIMIT 2000")
rows = c.fetchall()
engrams = []
for r in rows:
meta = json.loads(r["metadata_json"] or "{}")
engrams.append({
"id": r["id"],
"content": r["content"],
"tags": set(meta.get("tags", [])),
"keywords": extract_keywords(r["content"]),
"source": meta.get("source"),
})
updated = 0
for i, eg in enumerate(engrams):
# Ähnliche finden durch Tag-Überlappung und Keyword-Jaccard
candidates = []
for other in engrams:
if other["id"] == eg["id"]:
continue
# Tag-Overlap
tag_overlap = len(eg["tags"] & other["tags"])
# Keyword-Jaccard
kw_intersection = len(eg["keywords"] & other["keywords"])
kw_union = len(eg["keywords"] | other["keywords"])
kw_jaccard = kw_intersection / kw_union if kw_union > 0 else 0
score = tag_overlap * 2 + kw_jaccard * 5
if score > 1.0:
candidates.append((other["id"], score, list(eg["tags"] & other["tags"]), list(eg["keywords"] & other["keywords"])))
candidates.sort(key=lambda x: x[1], reverse=True)
top5 = candidates[:5]
if top5:
# In metadata speichern
meta = json.loads(rows[i]["metadata_json"] or "{}")
meta["predictive_links"] = [{"engram_id": cid, "score": round(s, 2), "common_tags": ct, "common_keywords": ck} for cid, s, ct, ck in top5]
c.execute("UPDATE engrams SET metadata_json = ?, modified_at = ? WHERE id = ?",
(json.dumps(meta), datetime.now(timezone.utc).isoformat(), eg["id"]))
updated += 1
conn.commit()
conn.close()
print(json.dumps({
"success": True,
"time": datetime.now(timezone.utc).isoformat(),
"engrams_processed": len(engrams),
"engrams_updated": updated,
}, indent=2, ensure_ascii=False))
if __name__ == "__main__":
run()