From 6d99c520e6cbc63acfeffe6cf51982ddfbc292ae Mon Sep 17 00:00:00 2001 From: Otto Date: Fri, 29 May 2026 11:30:24 +0200 Subject: [PATCH] feat: add verdict/evidence verification model --- cron_tasks/verify_pending_external.py | 36 +------ fastapi_app.py | 145 +++++++++++++++++++++++--- src/engram.py | 63 ++++++++++- static/style.css | 18 ++++ templates/dashboard.html | 16 +++ 5 files changed, 227 insertions(+), 51 deletions(-) diff --git a/cron_tasks/verify_pending_external.py b/cron_tasks/verify_pending_external.py index b1ee909..24c11a1 100755 --- a/cron_tasks/verify_pending_external.py +++ b/cron_tasks/verify_pending_external.py @@ -78,7 +78,7 @@ def main() -> int: pending = [ eg for eg in all_egs - if (not eg.correctness.confirmed and eg.correctness.rejections == 0) + if (not eg.correctness.is_final()) ] confirmed = 0 @@ -94,16 +94,7 @@ def main() -> int: if src == "session" and ( content.startswith("Session Summary (sess_") or content.startswith("Please remember ") ): - eg.correctness.rejections += 1 - eg.correctness.last_reviewed = _now() - eg.correctness.review_history.append( - ReviewEntry( - by="verify-pending", - action="reject", - at=_now(), - note="Auto-reject: session placeholder", - ) - ) + eg.correctness.reject("verify-pending", "Auto-reject: session placeholder") store.save(eg) rejected += 1 continue @@ -118,30 +109,11 @@ def main() -> int: still_pending += 1 continue if 200 <= status < 300: - eg.correctness.confirmed = True - eg.correctness.confirmations += 1 - eg.correctness.last_reviewed = _now() - eg.correctness.review_history.append( - ReviewEntry( - by="verify-pending", - action="confirm", - at=_now(), - note=f"Auto-confirm: web url ok ({status}) {url}", - ) - ) + eg.correctness.confirm("verify-pending", f"Auto-confirm: web url ok ({status}) {url}") store.save(eg) confirmed += 1 else: - eg.correctness.rejections += 1 - eg.correctness.last_reviewed = _now() - eg.correctness.review_history.append( - ReviewEntry( - by="verify-pending", - action="reject", - at=_now(), - note=f"Auto-reject: web url status={status} {url}", - ) - ) + eg.correctness.reject("verify-pending", f"Auto-reject: web url status={status} {url}") store.save(eg) rejected += 1 continue diff --git a/fastapi_app.py b/fastapi_app.py index 098714e..b5feed6 100644 --- a/fastapi_app.py +++ b/fastapi_app.py @@ -54,11 +54,22 @@ def get_db(): def parse_engram(row: sqlite3.Row) -> dict: meta = json.loads(row["metadata_json"] or "{}") correctness = json.loads(row["correctness_json"] or "{}") + verdict = correctness.get("verdict") + if not isinstance(verdict, str) or not verdict: + # Back-compat inference for older rows + if correctness.get("confirmed", False): + verdict = "confirmed_true" + elif int(correctness.get("rejections", 0) or 0) > 0: + verdict = "confirmed_false" + else: + verdict = "unknown" return { "id": row["id"], "content": row["content"], "confidence": meta.get("confidence", 0.0), "confirmed": correctness.get("confirmed", False), + "verdict": verdict, + "evidence": correctness.get("evidence", []), "confirmations": correctness.get("confirmations", 0), "rejections": correctness.get("rejections", 0), "tags": meta.get("tags", []), @@ -88,6 +99,8 @@ def _update_correctness(engram_id: str, *, action: str, reason: str | None = Non raise FileNotFoundError(f"Engram not found: {engram_id}") corr = json.loads(row["correctness_json"] or "{}") + corr.setdefault("verdict", None) + corr.setdefault("evidence", []) corr.setdefault("confirmed", False) corr.setdefault("confirmations", 0) corr.setdefault("rejections", 0) @@ -106,10 +119,30 @@ def _update_correctness(engram_id: str, *, action: str, reason: str | None = Non corr["review_history"] = [entry] if action == "confirm": + corr["verdict"] = "confirmed_true" corr["confirmed"] = True corr["confirmations"] = int(corr.get("confirmations", 0) or 0) + 1 elif action == "reject": + corr["verdict"] = "confirmed_false" corr["rejections"] = int(corr.get("rejections", 0) or 0) + 1 + corr["confirmed"] = False + + # Store minimal evidence for dashboard-driven actions. + try: + ev = corr.get("evidence") + if not isinstance(ev, list): + ev = [] + ev.append( + { + "kind": "human", + "by": "dashboard", + "at": corr["last_reviewed"], + "action": action, + } + ) + corr["evidence"] = ev[-50:] # cap growth + except Exception: + pass c.execute( "UPDATE engrams SET correctness_json = ?, modified_at = ? WHERE id = ?", @@ -232,8 +265,25 @@ def api_storage_stats(): conn = get_db() c = conn.cursor() total = c.execute("SELECT COUNT(*) FROM engrams").fetchone()[0] - confirmed = c.execute( - "SELECT COUNT(*) FROM engrams WHERE json_extract(correctness_json, '$.confirmed') = 1" + confirmed_true = c.execute( + """ + SELECT COUNT(*) FROM engrams + WHERE ( + json_extract(correctness_json, '$.verdict') = 'confirmed_true' + OR (json_extract(correctness_json, '$.verdict') IS NULL AND json_extract(correctness_json, '$.confirmed') = 1) + ) + """ + ).fetchone()[0] + confirmed_false = c.execute( + """ + SELECT COUNT(*) FROM engrams + WHERE ( + json_extract(correctness_json, '$.verdict') = 'confirmed_false' + OR (json_extract(correctness_json, '$.verdict') IS NULL + AND json_extract(correctness_json, '$.confirmed') = 0 + AND COALESCE(json_extract(correctness_json, '$.rejections'), 0) > 0) + ) + """ ).fetchone()[0] sources = { r[0]: r[1] @@ -268,8 +318,9 @@ def api_storage_stats(): return { "sql": { "total_engrams": total, - "confirmed": confirmed, - "pending": total - confirmed, + "confirmed": confirmed_true, + "rejected": confirmed_false, + "pending": total - confirmed_true - confirmed_false, "by_source": sources, }, "vector": { @@ -310,10 +361,27 @@ def api_insights(limit: int = Query(8, ge=1, le=50)): "SELECT id, metadata_json, correctness_json, created_at, modified_at FROM engrams ORDER BY created_at DESC LIMIT 2000" ).fetchall() total = c.execute("SELECT COUNT(*) FROM engrams").fetchone()[0] - confirmed = c.execute( - "SELECT COUNT(*) FROM engrams WHERE json_extract(correctness_json, '$.confirmed') = 1" + confirmed_true = c.execute( + """ + SELECT COUNT(*) FROM engrams + WHERE ( + json_extract(correctness_json, '$.verdict') = 'confirmed_true' + OR (json_extract(correctness_json, '$.verdict') IS NULL AND json_extract(correctness_json, '$.confirmed') = 1) + ) + """ ).fetchone()[0] - pending = total - confirmed + confirmed_false = c.execute( + """ + SELECT COUNT(*) FROM engrams + WHERE ( + json_extract(correctness_json, '$.verdict') = 'confirmed_false' + OR (json_extract(correctness_json, '$.verdict') IS NULL + AND json_extract(correctness_json, '$.confirmed') = 0 + AND COALESCE(json_extract(correctness_json, '$.rejections'), 0) > 0) + ) + """ + ).fetchone()[0] + pending = total - confirmed_true - confirmed_false tag_counts: dict[str, int] = {} source_counts: dict[str, int] = {} @@ -488,10 +556,27 @@ def api_stats(): conn = get_db() c = conn.cursor() total = c.execute("SELECT COUNT(*) FROM engrams").fetchone()[0] - confirmed = c.execute( - "SELECT COUNT(*) FROM engrams WHERE json_extract(correctness_json, '$.confirmed') = 1" + confirmed_true = c.execute( + """ + SELECT COUNT(*) FROM engrams + WHERE ( + json_extract(correctness_json, '$.verdict') = 'confirmed_true' + OR (json_extract(correctness_json, '$.verdict') IS NULL AND json_extract(correctness_json, '$.confirmed') = 1) + ) + """ ).fetchone()[0] - pending = total - confirmed + confirmed_false = c.execute( + """ + SELECT COUNT(*) FROM engrams + WHERE ( + json_extract(correctness_json, '$.verdict') = 'confirmed_false' + OR (json_extract(correctness_json, '$.verdict') IS NULL + AND json_extract(correctness_json, '$.confirmed') = 0 + AND COALESCE(json_extract(correctness_json, '$.rejections'), 0) > 0) + ) + """ + ).fetchone()[0] + pending = total - confirmed_true - confirmed_false errors = c.execute( "SELECT COUNT(*) FROM engrams WHERE json_extract(metadata_json, '$.tags') LIKE '%error%'" ).fetchone()[0] @@ -501,7 +586,8 @@ def api_stats(): conn.close() return { "total": total, - "confirmed": confirmed, + "confirmed": confirmed_true, + "rejected": confirmed_false, "pending": pending, "errors": errors, "avg_confidence": round(avg_conf, 2), @@ -514,6 +600,7 @@ def api_engrams( offset: int = Query(0, ge=0), tag: str = Query(None), confirmed: bool = Query(None), + verdict: str = Query(None), search: str = Query(None), min_confidence: float = Query(0.0), ): @@ -527,9 +614,30 @@ def api_engrams( params.append(f'%"{tag}"%') if confirmed is not None: - where_clauses.append( - f"json_extract(correctness_json, '$.confirmed') = {int(confirmed)}" - ) + if confirmed: + # confirmed == statement is true (verdict confirmed_true) + where_clauses.append( + "(" + "json_extract(correctness_json, '$.verdict') = 'confirmed_true' " + "OR (json_extract(correctness_json, '$.verdict') IS NULL AND json_extract(correctness_json, '$.confirmed') = 1)" + ")" + ) + else: + # pending/unresolved (unknown/probable) but exclude confirmed_false. + where_clauses.append( + "(" + "json_extract(correctness_json, '$.verdict') IN ('unknown','probable_true','probable_false') " + "OR (json_extract(correctness_json, '$.verdict') IS NULL " + " AND json_extract(correctness_json, '$.confirmed') = 0 " + " AND COALESCE(json_extract(correctness_json, '$.rejections'), 0) = 0)" + ")" + ) + + if verdict: + v = verdict.strip() + if v in ("unknown", "probable_true", "probable_false", "confirmed_true", "confirmed_false"): + where_clauses.append("json_extract(correctness_json, '$.verdict') = ?") + params.append(v) if search: # Use FTS @@ -740,6 +848,8 @@ def api_create_engram(content: str = Form(...), tags: str = Form(""), source: st "hash": "", } correctness = { + "verdict": "unknown", + "evidence": [], "confirmed": False, "confirmations": 0, "rejections": 0, @@ -767,7 +877,12 @@ def api_pending(limit: int = Query(20, ge=1, le=100), offset: int = Query(0, ge= rows = c.execute( """ SELECT * FROM engrams - WHERE json_extract(correctness_json, '$.confirmed') = 0 + WHERE ( + json_extract(correctness_json, '$.verdict') IN ('unknown','probable_true','probable_false') + OR (json_extract(correctness_json, '$.verdict') IS NULL + AND json_extract(correctness_json, '$.confirmed') = 0 + AND COALESCE(json_extract(correctness_json, '$.rejections'), 0) = 0) + ) ORDER BY created_at DESC LIMIT ? OFFSET ? """, diff --git a/src/engram.py b/src/engram.py index 8eabbb8..8bd59e6 100644 --- a/src/engram.py +++ b/src/engram.py @@ -40,26 +40,60 @@ class ReviewEntry: @dataclass class Correctness: """Verfolgt die Korrektheit eines Engramms über Zeit.""" + # verdict model (not only binary confirm/reject) + # Values: + # - unknown + # - probable_true / probable_false + # - confirmed_true / confirmed_false + verdict: str = "unknown" + evidence: List[Dict[str, Any]] = field(default_factory=list) confirmed: bool = False confirmations: int = 0 rejections: int = 0 last_reviewed: Optional[str] = None review_history: List[ReviewEntry] = field(default_factory=list) + def is_final(self) -> bool: + return self.verdict in ("confirmed_true", "confirmed_false") + + def set_verdict(self, by: str, verdict: str, note: str = "", evidence: Optional[List[Dict[str, Any]]] = None) -> None: + verdict = (verdict or "").strip() + if verdict not in ("unknown", "probable_true", "probable_false", "confirmed_true", "confirmed_false"): + verdict = "unknown" + self.verdict = verdict + # Keep backward-compatible boolean in sync: + # historically, confirmed=True meant "this statement is correct". + self.confirmed = verdict == "confirmed_true" + self.last_reviewed = _now() + if evidence: + try: + self.evidence.extend([e for e in evidence if isinstance(e, dict)]) + except Exception: + pass + self.review_history.append(ReviewEntry(by, "set_verdict", self.last_reviewed, f"{verdict}: {note}".strip())) + def confirm(self, by: str, note: str = "") -> None: self.confirmations += 1 - self.confirmed = True - self.last_reviewed = _now() + self.set_verdict(by, "confirmed_true", note) + # Preserve historic action tag too self.review_history.append(ReviewEntry(by, "confirm", self.last_reviewed, note)) def reject(self, by: str, note: str = "") -> None: self.rejections += 1 - self.confirmed = False - self.last_reviewed = _now() + self.set_verdict(by, "confirmed_false", note) self.review_history.append(ReviewEntry(by, "reject", self.last_reviewed, note)) def score(self) -> float: """Confidence-Score aus Korrekturhistorie.""" + # verdict-first scoring (explicit, non-binary) + if self.verdict == "confirmed_true": + return 1.0 + if self.verdict == "confirmed_false": + return 0.0 + if self.verdict == "probable_true": + return 0.75 + if self.verdict == "probable_false": + return 0.25 total = self.confirmations + self.rejections if total == 0: return 0.5 # Unbestimmt @@ -74,6 +108,8 @@ class Correctness: else: review_history.append(entry.to_dict()) return { + "verdict": self.verdict, + "evidence": self.evidence, "confirmed": self.confirmed, "confirmations": self.confirmations, "rejections": self.rejections, @@ -84,11 +120,30 @@ class Correctness: @classmethod def from_dict(cls, d: dict) -> "Correctness": c = cls() + verdict = d.get("verdict") + if isinstance(verdict, str) and verdict.strip(): + c.verdict = verdict.strip() c.confirmed = d.get("confirmed", False) c.confirmations = d.get("confirmations", 0) c.rejections = d.get("rejections", 0) c.last_reviewed = d.get("last_reviewed") + ev = d.get("evidence", []) + if isinstance(ev, list): + c.evidence = [e for e in ev if isinstance(e, dict)] c.review_history = [ReviewEntry.from_dict(r) for r in d.get("review_history", [])] + # Backfill verdict if missing/invalid. + if c.verdict not in ("unknown", "probable_true", "probable_false", "confirmed_true", "confirmed_false"): + if c.confirmed: + c.verdict = "confirmed_true" + elif c.rejections > 0: + c.verdict = "confirmed_false" + else: + c.verdict = "unknown" + # Ensure boolean stays consistent for older mixed data. + if c.verdict == "confirmed_true": + c.confirmed = True + elif c.verdict == "confirmed_false": + c.confirmed = False return c diff --git a/static/style.css b/static/style.css index 6a3e8f3..f2efe55 100644 --- a/static/style.css +++ b/static/style.css @@ -124,6 +124,24 @@ body { color: #8a9aff; font-size: 0.72rem; } + +.verdict-pill{ + display:inline-block; + margin: 2px 6px 2px 0; + padding: 2px 8px; + border-radius: 999px; + font-size: 0.72rem; + font-weight: 800; + letter-spacing: 0.4px; + border: 1px solid #2a2a3a; + background: #1e1e28; + color: #cfd3ff; +} +.verdict-pill.v-true{ border-color:#2f6b3f; color:#aaf0b6; } +.verdict-pill.v-false{ border-color:#7a2c2c; color:#ffb3b3; } +.verdict-pill.v-prob-true{ border-color:#6c8af5; color:#cfd9ff; } +.verdict-pill.v-prob-false{ border-color:#b08a2a; color:#ffe2a3; } +.verdict-pill.v-unknown{ border-color:#3a3a55; color:#b9b9c9; } .muted { color: #888899; font-size: 0.8rem; diff --git a/templates/dashboard.html b/templates/dashboard.html index 8b1f378..4c3bd0e 100644 --- a/templates/dashboard.html +++ b/templates/dashboard.html @@ -29,6 +29,7 @@ + @@ -130,6 +131,7 @@ async function loadCards() { if (state.search) url += `&search=${encodeURIComponent(state.search)}`; if (state.filter === 'confirmed') url += '&confirmed=1'; if (state.filter === 'pending') url += '&confirmed=0'; + if (state.filter === 'rejected') url += '&verdict=confirmed_false'; if (state.filter === 'errors') url += '&tag=error'; const data = await api(url); @@ -347,6 +349,7 @@ function renderCards() {
${Math.round(item.confidence*100)}% + ${renderVerdictPill(item)} ${item.tags.map(t => ''+t+'').join('')} ${fmtDate(item.created)}
@@ -365,6 +368,19 @@ function renderCards() { `).join(''); } +function renderVerdictPill(item) { + const v = (item.verdict || '').toString(); + if (!v) return ''; + let cls = 'v-unknown'; + let label = v; + if (v === 'confirmed_true') { cls = 'v-true'; label = 'TRUE'; } + else if (v === 'confirmed_false') { cls = 'v-false'; label = 'FALSE'; } + else if (v === 'probable_true') { cls = 'v-prob-true'; label = 'LIKELY'; } + else if (v === 'probable_false') { cls = 'v-prob-false'; label = 'UNLIKELY'; } + else if (v === 'unknown') { cls = 'v-unknown'; label = 'UNKNOWN'; } + return `${label}`; +} + function fmtDate(iso) { const d = new Date(iso); return `${d.getDate().toString().padStart(2,'0')}.${(d.getMonth()+1).toString().padStart(2,'0')} ${d.getHours().toString().padStart(2,'0')}:${d.getMinutes().toString().padStart(2,'0')}`;