85 lines
3.1 KiB
Python
85 lines
3.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Extends engrams with predictive links: searches for engrams with similar content
(based on tag overlap and keyword matching) and stores the suggestions.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
import sqlite3
|
|
import sys
|
|
from collections import Counter
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Root directory of the second-brain workspace.
BRAIN_DIR = Path("/root/.openclaw/workspace/second-brain")
# SQLite database file, located at <BRAIN_DIR>/data/brain.sqlite.
DB_PATH = BRAIN_DIR.joinpath("data", "brain.sqlite")
|
|
|
|
# A keyword is a standalone run of 4+ letters. ``[^\W\d_]`` is "word character
# minus digits and underscore", i.e. any Unicode letter, so German words with
# umlauts or ß are matched instead of being silently dropped.
_KEYWORD_RE = re.compile(r"\b[^\W\d_]{4,}\b")

# Simple stopword list; only entries with 4+ letters can actually be hit.
_STOPWORDS = frozenset(
    {"und", "die", "der", "ein", "eine", "auf", "von", "zu", "mit", "für", "ist", "das", "nicht"}
)


def extract_keywords(text: str, max_words: int = 10) -> set[str]:
    """Return up to *max_words* lowercase keywords found in *text*.

    Keywords are words of at least four letters that are not in the stopword
    list. When more than *max_words* distinct keywords exist, the most
    frequent ones are kept (ties go to the word that appears first), so the
    result is deterministic for a given text.

    Args:
        text: Text to scan. Empty or ``None`` yields an empty set.
        max_words: Maximum number of keywords to return. Values <= 0 yield an
            empty set.

    Returns:
        A set with at most *max_words* lowercase keywords.
    """
    if not text or max_words <= 0:
        return set()
    counts = Counter(
        word
        for word in _KEYWORD_RE.findall(text.lower())
        if word not in _STOPWORDS
    )
    # A set cannot be sliced, so the size limit is applied via most_common().
    return {word for word, _ in counts.most_common(max_words)}
|
|
|
|
# Tuning knobs for the similarity search.
_MAX_ENGRAMS = 2000   # only the newest N engrams are compared (all pairs)
_MAX_LINKS = 5        # suggestions stored per engram
_MIN_SCORE = 1.0      # a candidate must score strictly above this
_TAG_WEIGHT = 2       # points per shared tag
_KEYWORD_WEIGHT = 5   # multiplier for the keyword Jaccard index (0..1)


def _parse_metadata(raw):
    """Return the decoded metadata dict, or None if *raw* is not a JSON object."""
    try:
        meta = json.loads(raw or "{}")
    except (TypeError, ValueError):
        return None
    return meta if isinstance(meta, dict) else None


def _load_engrams(conn):
    """Load the newest engrams and precompute their tag and keyword sets."""
    cursor = conn.execute(
        "SELECT id, content, metadata_json FROM engrams "
        "ORDER BY created_at DESC LIMIT ?",
        (_MAX_ENGRAMS,),
    )
    engrams = []
    for row in cursor:
        meta = _parse_metadata(row["metadata_json"])
        if meta is None:
            # Rewriting unreadable metadata would destroy it, so leave the
            # row alone and keep processing the rest of the batch.
            print(
                f"warning: engram {row['id']!r} has invalid metadata_json; skipped",
                file=sys.stderr,
            )
            continue
        raw_tags = meta.get("tags")
        if isinstance(raw_tags, list):
            # Only scalar tags can be put into a set and compared.
            tags = {t for t in raw_tags if isinstance(t, (str, int, float))}
        else:
            tags = set()
        engrams.append({
            "id": row["id"],
            "content": row["content"],
            "tags": tags,
            "keywords": extract_keywords(row["content"] or ""),
            "source": meta.get("source"),
            "meta": meta,
        })
    return engrams


def _score_links(engram, engrams):
    """Return the best link suggestions for *engram*, highest score first."""
    candidates = []
    for other in engrams:
        if other["id"] == engram["id"]:
            continue
        common_tags = engram["tags"] & other["tags"]
        common_keywords = engram["keywords"] & other["keywords"]
        union_size = len(engram["keywords"] | other["keywords"])
        jaccard = len(common_keywords) / union_size if union_size else 0.0
        score = len(common_tags) * _TAG_WEIGHT + jaccard * _KEYWORD_WEIGHT
        if score > _MIN_SCORE:
            candidates.append((score, other["id"], common_tags, common_keywords))
    candidates.sort(key=lambda candidate: candidate[0], reverse=True)
    return [
        {
            "engram_id": other_id,
            "score": round(score, 2),
            # Sorted so the stored JSON is stable between runs.
            "common_tags": sorted(common_tags, key=str),
            "common_keywords": sorted(common_keywords),
        }
        for score, other_id, common_tags, common_keywords in candidates[:_MAX_LINKS]
    ]


def run():
    """Compute predictive links between engrams and store them in the database.

    Loads the newest engrams from the SQLite database at ``DB_PATH``, scores
    every pair by shared tags and keyword Jaccard similarity, and writes the
    top suggestions into each engram's ``metadata_json`` under the
    ``predictive_links`` key, setting ``modified_at`` to the current UTC time.
    Engrams without any candidate above the threshold are not touched, so an
    existing ``predictive_links`` entry on them is left as it is.

    All updates are committed together at the end; if anything fails, nothing
    is committed and the connection is still closed. A JSON summary is printed
    to stdout.
    """
    conn = sqlite3.connect(str(DB_PATH))
    try:
        conn.row_factory = sqlite3.Row
        engrams = _load_engrams(conn)

        updated = 0
        for engram in engrams:
            links = _score_links(engram, engrams)
            if not links:
                continue
            meta = engram["meta"]
            meta["predictive_links"] = links
            conn.execute(
                "UPDATE engrams SET metadata_json = ?, modified_at = ? WHERE id = ?",
                (json.dumps(meta), datetime.now(timezone.utc).isoformat(), engram["id"]),
            )
            updated += 1

        conn.commit()
    finally:
        # Closing without a commit discards pending changes, so a failure
        # never leaves a half-written batch behind.
        conn.close()

    print(json.dumps({
        "success": True,
        "time": datetime.now(timezone.utc).isoformat(),
        "engrams_processed": len(engrams),
        "engrams_updated": updated,
    }, indent=2, ensure_ascii=False))
|
|
|
|
# Run the linker only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()
|