fix: performance optimizations for large datasets

- Add generated columns and indexes for correctness and metadata fields
- Optimize get_all() with keyset pagination
- Add get_pending_for_review() for targeted queries
- Update cron tasks to use optimized queries instead of full table scans
- Fix the 300s timeouts in review_brain and verify_pending_external

Fixes #35: Second-Brain in Takt bringen, Dedup, Pendings, Graph und Performance
This commit is contained in:
2026-06-04 12:25:11 +02:00
parent 6abe4d36e8
commit 8783bb2db5
11 changed files with 203 additions and 18 deletions

View File

@@ -22,10 +22,14 @@ def extract_keywords(text: str, max_words: int = 10) -> set[str]:
words = re.findall(r"\b[a-zA-Z]{4,}\b", text.lower())
# Stopwörter filtern (einfache Liste)
stopwords = {"und", "die", "der", "ein", "eine", "auf", "von", "zu", "mit", "für", "ist", "das", "nicht"}
return set(w for w in words if w not in stopwords)[:max_words]
unique_words = set(w for w in words if w not in stopwords)
# Begrenze auf max_words (Umwandlung in Liste für Slicing, dann zurück zu Set)
return set(list(unique_words)[:max_words])
def run():
conn = sqlite3.connect(str(DB_PATH))
conn = sqlite3.connect(str(DB_PATH), timeout=60)
conn.execute("PRAGMA busy_timeout=60000")
conn.execute("PRAGMA journal_mode=WAL")
conn.row_factory = sqlite3.Row
c = conn.cursor()