fix: performance optimizations for large dataset

- Add generated columns and indexes for correctness and metadata fields
- Optimize get_all() with keyset pagination
- Add get_pending_for_review() for targeted queries
- Update cron tasks to use optimized queries instead of full table scans
- This fixes timeouts in review_brain and verify_pending_external (300s timeout)

Fixes #35: Second-Brain in Takt bringen, Dedup, Pendings, Graph und Performance
2026-06-04 12:25:11 +02:00
parent 6abe4d36e8
commit 8783bb2db5
11 changed files with 203 additions and 18 deletions
--- a/cron_tasks/predictive_links.py
+++ b/cron_tasks/predictive_links.py
@@ -22,10 +22,14 @@ def extract_keywords(text: str, max_words: int = 10) -> set[str]:
    words = re.findall(r"\b[a-zA-Z]{4,}\b", text.lower())
    # Stopwörter filtern (einfache Liste)
    stopwords = {"und", "die", "der", "ein", "eine", "auf", "von", "zu", "mit", "für", "ist", "das", "nicht"}
-    return set(w for w in words if w not in stopwords)[:max_words]
+    unique_words = set(w for w in words if w not in stopwords)
+    # Begrenze auf max_words (Umwandlung in Liste für Slicing, dann zurück zu Set)
+    return set(list(unique_words)[:max_words])

 def run():
-    conn = sqlite3.connect(str(DB_PATH))
+    conn = sqlite3.connect(str(DB_PATH), timeout=60)
+    conn.execute("PRAGMA busy_timeout=60000")
+    conn.execute("PRAGMA journal_mode=WAL")
    conn.row_factory = sqlite3.Row
    c = conn.cursor()