feat: add verdict/evidence verification model

2026-05-29 11:30:24 +02:00
parent f10a5b9f19
commit 6d99c520e6
5 changed files with 227 additions and 51 deletions
--- a/cron_tasks/verify_pending_external.py
+++ b/cron_tasks/verify_pending_external.py
@@ -78,7 +78,7 @@ def main() -> int:
    pending = [
        eg
        for eg in all_egs
-        if (not eg.correctness.confirmed and eg.correctness.rejections == 0)
+        if (not eg.correctness.is_final())
    ]
    confirmed = 0
@@ -94,16 +94,7 @@ def main() -> int:
        if src == "session" and (
            content.startswith("Session Summary (sess_") or content.startswith("Please remember ")
        ):
-            eg.correctness.rejections += 1
+            eg.correctness.reject("verify-pending", "Auto-reject: session placeholder")
            eg.correctness.last_reviewed = _now()
            eg.correctness.review_history.append(
                ReviewEntry(
                    by="verify-pending",
                    action="reject",
                    at=_now(),
                    note="Auto-reject: session placeholder",
                )
            )
            store.save(eg)
            rejected += 1
            continue
@@ -118,30 +109,11 @@ def main() -> int:
                still_pending += 1
                continue
            if 200 <= status < 300:
-                eg.correctness.confirmed = True
+                eg.correctness.confirm("verify-pending", f"Auto-confirm: web url ok ({status}) {url}")
                eg.correctness.confirmations += 1
                eg.correctness.last_reviewed = _now()
                eg.correctness.review_history.append(
                    ReviewEntry(
                        by="verify-pending",
                        action="confirm",
                        at=_now(),
                        note=f"Auto-confirm: web url ok ({status}) {url}",
                    )
                )
                store.save(eg)
                confirmed += 1
            else:
-                eg.correctness.rejections += 1
+                eg.correctness.reject("verify-pending", f"Auto-reject: web url status={status} {url}")
                eg.correctness.last_reviewed = _now()
                eg.correctness.review_history.append(
                    ReviewEntry(
                        by="verify-pending",
                        action="reject",
                        at=_now(),
                        note=f"Auto-reject: web url status={status} {url}",
                    )
                )
                store.save(eg)
                rejected += 1
            continue
--- a/fastapi_app.py
+++ b/fastapi_app.py
@@ -54,11 +54,22 @@ def get_db():
 def parse_engram(row: sqlite3.Row) -> dict:
    meta = json.loads(row["metadata_json"] or "{}")
    correctness = json.loads(row["correctness_json"] or "{}")
    verdict = correctness.get("verdict")
    if not isinstance(verdict, str) or not verdict:
        # Back-compat inference for older rows
        if correctness.get("confirmed", False):
            verdict = "confirmed_true"
        elif int(correctness.get("rejections", 0) or 0) > 0:
            verdict = "confirmed_false"
        else:
            verdict = "unknown"
    return {
        "id": row["id"],
        "content": row["content"],
        "confidence": meta.get("confidence", 0.0),
        "confirmed": correctness.get("confirmed", False),
        "verdict": verdict,
        "evidence": correctness.get("evidence", []),
        "confirmations": correctness.get("confirmations", 0),
        "rejections": correctness.get("rejections", 0),
        "tags": meta.get("tags", []),
@@ -88,6 +99,8 @@ def _update_correctness(engram_id: str, *, action: str, reason: str | None = Non
        raise FileNotFoundError(f"Engram not found: {engram_id}")
    corr = json.loads(row["correctness_json"] or "{}")
    corr.setdefault("verdict", None)
    corr.setdefault("evidence", [])
    corr.setdefault("confirmed", False)
    corr.setdefault("confirmations", 0)
    corr.setdefault("rejections", 0)
@@ -106,10 +119,30 @@ def _update_correctness(engram_id: str, *, action: str, reason: str | None = Non
        corr["review_history"] = [entry]
    if action == "confirm":
        corr["verdict"] = "confirmed_true"
        corr["confirmed"] = True
        corr["confirmations"] = int(corr.get("confirmations", 0) or 0) + 1
    elif action == "reject":
        corr["verdict"] = "confirmed_false"
        corr["rejections"] = int(corr.get("rejections", 0) or 0) + 1
        corr["confirmed"] = False
    # Store minimal evidence for dashboard-driven actions.
    try:
        ev = corr.get("evidence")
        if not isinstance(ev, list):
            ev = []
        ev.append(
            {
                "kind": "human",
                "by": "dashboard",
                "at": corr["last_reviewed"],
                "action": action,
            }
        )
        corr["evidence"] = ev[-50:]  # cap growth
    except Exception:
        pass
    c.execute(
        "UPDATE engrams SET correctness_json = ?, modified_at = ? WHERE id = ?",
@@ -232,8 +265,25 @@ def api_storage_stats():
    conn = get_db()
    c = conn.cursor()
    total = c.execute("SELECT COUNT(*) FROM engrams").fetchone()[0]
-    confirmed = c.execute(
+    confirmed_true = c.execute(
-        "SELECT COUNT(*) FROM engrams WHERE json_extract(correctness_json, '$.confirmed') = 1"
+        """
        SELECT COUNT(*) FROM engrams
        WHERE (
          json_extract(correctness_json, '$.verdict') = 'confirmed_true'
          OR (json_extract(correctness_json, '$.verdict') IS NULL AND json_extract(correctness_json, '$.confirmed') = 1)
        )
        """
    ).fetchone()[0]
    confirmed_false = c.execute(
        """
        SELECT COUNT(*) FROM engrams
        WHERE (
          json_extract(correctness_json, '$.verdict') = 'confirmed_false'
          OR (json_extract(correctness_json, '$.verdict') IS NULL
              AND json_extract(correctness_json, '$.confirmed') = 0
              AND COALESCE(json_extract(correctness_json, '$.rejections'), 0) > 0)
        )
        """
    ).fetchone()[0]
    sources = {
        r[0]: r[1]
@@ -268,8 +318,9 @@ def api_storage_stats():
    return {
        "sql": {
            "total_engrams": total,
-            "confirmed": confirmed,
+            "confirmed": confirmed_true,
-            "pending": total - confirmed,
+            "rejected": confirmed_false,
            "pending": total - confirmed_true - confirmed_false,
            "by_source": sources,
        },
        "vector": {
@@ -310,10 +361,27 @@ def api_insights(limit: int = Query(8, ge=1, le=50)):
        "SELECT id, metadata_json, correctness_json, created_at, modified_at FROM engrams ORDER BY created_at DESC LIMIT 2000"
    ).fetchall()
    total = c.execute("SELECT COUNT(*) FROM engrams").fetchone()[0]
-    confirmed = c.execute(
+    confirmed_true = c.execute(
-        "SELECT COUNT(*) FROM engrams WHERE json_extract(correctness_json, '$.confirmed') = 1"
+        """
        SELECT COUNT(*) FROM engrams
        WHERE (
          json_extract(correctness_json, '$.verdict') = 'confirmed_true'
          OR (json_extract(correctness_json, '$.verdict') IS NULL AND json_extract(correctness_json, '$.confirmed') = 1)
        )
        """
    ).fetchone()[0]
-    pending = total - confirmed
+    confirmed_false = c.execute(
        """
        SELECT COUNT(*) FROM engrams
        WHERE (
          json_extract(correctness_json, '$.verdict') = 'confirmed_false'
          OR (json_extract(correctness_json, '$.verdict') IS NULL
              AND json_extract(correctness_json, '$.confirmed') = 0
              AND COALESCE(json_extract(correctness_json, '$.rejections'), 0) > 0)
        )
        """
    ).fetchone()[0]
    pending = total - confirmed_true - confirmed_false
    tag_counts: dict[str, int] = {}
    source_counts: dict[str, int] = {}
@@ -488,10 +556,27 @@ def api_stats():
    conn = get_db()
    c = conn.cursor()
    total = c.execute("SELECT COUNT(*) FROM engrams").fetchone()[0]
-    confirmed = c.execute(
+    confirmed_true = c.execute(
-        "SELECT COUNT(*) FROM engrams WHERE json_extract(correctness_json, '$.confirmed') = 1"
+        """
        SELECT COUNT(*) FROM engrams
        WHERE (
          json_extract(correctness_json, '$.verdict') = 'confirmed_true'
          OR (json_extract(correctness_json, '$.verdict') IS NULL AND json_extract(correctness_json, '$.confirmed') = 1)
        )
        """
    ).fetchone()[0]
-    pending = total - confirmed
+    confirmed_false = c.execute(
        """
        SELECT COUNT(*) FROM engrams
        WHERE (
          json_extract(correctness_json, '$.verdict') = 'confirmed_false'
          OR (json_extract(correctness_json, '$.verdict') IS NULL
              AND json_extract(correctness_json, '$.confirmed') = 0
              AND COALESCE(json_extract(correctness_json, '$.rejections'), 0) > 0)
        )
        """
    ).fetchone()[0]
    pending = total - confirmed_true - confirmed_false
    errors = c.execute(
        "SELECT COUNT(*) FROM engrams WHERE json_extract(metadata_json, '$.tags') LIKE '%error%'"
    ).fetchone()[0]
@@ -501,7 +586,8 @@ def api_stats():
    conn.close()
    return {
        "total": total,
-        "confirmed": confirmed,
+        "confirmed": confirmed_true,
        "rejected": confirmed_false,
        "pending": pending,
        "errors": errors,
        "avg_confidence": round(avg_conf, 2),
@@ -514,6 +600,7 @@ def api_engrams(
    offset: int = Query(0, ge=0),
    tag: str = Query(None),
    confirmed: bool = Query(None),
    verdict: str = Query(None),
    search: str = Query(None),
    min_confidence: float = Query(0.0),
 ):
@@ -527,9 +614,30 @@ def api_engrams(
        params.append(f'%"{tag}"%')
    if confirmed is not None:
        if confirmed:
            # confirmed == statement is true (verdict confirmed_true)
            where_clauses.append(
-            f"json_extract(correctness_json, '$.confirmed') = {int(confirmed)}"
+                "("
                "json_extract(correctness_json, '$.verdict') = 'confirmed_true' "
                "OR (json_extract(correctness_json, '$.verdict') IS NULL AND json_extract(correctness_json, '$.confirmed') = 1)"
                ")"
            )
        else:
            # pending/unresolved (unknown/probable) but exclude confirmed_false.
            where_clauses.append(
                "("
                "json_extract(correctness_json, '$.verdict') IN ('unknown','probable_true','probable_false') "
                "OR (json_extract(correctness_json, '$.verdict') IS NULL "
                "    AND json_extract(correctness_json, '$.confirmed') = 0 "
                "    AND COALESCE(json_extract(correctness_json, '$.rejections'), 0) = 0)"
                ")"
            )
    if verdict:
        v = verdict.strip()
        if v in ("unknown", "probable_true", "probable_false", "confirmed_true", "confirmed_false"):
            where_clauses.append("json_extract(correctness_json, '$.verdict') = ?")
            params.append(v)
    if search:
        # Use FTS
@@ -740,6 +848,8 @@ def api_create_engram(content: str = Form(...), tags: str = Form(""), source: st
        "hash": "",
    }
    correctness = {
        "verdict": "unknown",
        "evidence": [],
        "confirmed": False,
        "confirmations": 0,
        "rejections": 0,
@@ -767,7 +877,12 @@ def api_pending(limit: int = Query(20, ge=1, le=100), offset: int = Query(0, ge=
    rows = c.execute(
        """
        SELECT * FROM engrams
-        WHERE json_extract(correctness_json, '$.confirmed') = 0
+        WHERE (
          json_extract(correctness_json, '$.verdict') IN ('unknown','probable_true','probable_false')
          OR (json_extract(correctness_json, '$.verdict') IS NULL
              AND json_extract(correctness_json, '$.confirmed') = 0
              AND COALESCE(json_extract(correctness_json, '$.rejections'), 0) = 0)
        )
        ORDER BY created_at DESC
        LIMIT ? OFFSET ?
        """,
--- a/src/engram.py
+++ b/src/engram.py
@@ -40,26 +40,60 @@ class ReviewEntry:
@dataclass
 class Correctness:
    """Verfolgt die Korrektheit eines Engramms über Zeit."""
    # verdict model (not only binary confirm/reject)
    # Values:
    # - unknown
    # - probable_true / probable_false
    # - confirmed_true / confirmed_false
    verdict: str = "unknown"
    evidence: List[Dict[str, Any]] = field(default_factory=list)
    confirmed: bool = False
    confirmations: int = 0
    rejections: int = 0
    last_reviewed: Optional[str] = None
    review_history: List[ReviewEntry] = field(default_factory=list)
    def is_final(self) -> bool:
        """Return True once the verdict is ``confirmed_true`` or ``confirmed_false``."""
        current = self.verdict
        if current == "confirmed_true":
            return True
        return current == "confirmed_false"
    def set_verdict(self, by: str, verdict: str, note: str = "", evidence: Optional[List[Dict[str, Any]]] = None) -> None:
        """Set the verdict, sync the legacy ``confirmed`` flag and log the change.

        Args:
            by: Reviewer identifier stored in the review-history entry.
            verdict: One of ``unknown``, ``probable_true``, ``probable_false``,
                ``confirmed_true`` or ``confirmed_false``. Surrounding
                whitespace is ignored; any other value (including ``None`` or
                a non-string) is stored as ``unknown``.
            note: Free-text note, recorded in the history entry as
                ``"<verdict>: <note>"``.
            evidence: Optional iterable of evidence items. Only ``dict`` items
                are appended to ``self.evidence``; everything else is ignored.
        """
        # A non-string verdict used to raise AttributeError on .strip();
        # treat it like any other unrecognised value instead.
        verdict = verdict.strip() if isinstance(verdict, str) else ""
        if verdict not in ("unknown", "probable_true", "probable_false", "confirmed_true", "confirmed_false"):
            verdict = "unknown"
        self.verdict = verdict
        # Keep backward-compatible boolean in sync:
        # historically, confirmed=True meant "this statement is correct".
        self.confirmed = verdict == "confirmed_true"
        self.last_reviewed = _now()
        if evidence:
            try:
                accepted = [e for e in evidence if isinstance(e, dict)]
            except TypeError:
                # ``evidence`` is not iterable, so there is nothing to record.
                accepted = []
            if not isinstance(self.evidence, list):
                # Recover from a corrupted attribute rather than dropping
                # the new evidence silently.
                self.evidence = []
            self.evidence.extend(accepted)
        self.review_history.append(ReviewEntry(by, "set_verdict", self.last_reviewed, f"{verdict}: {note}".strip()))
    def confirm(self, by: str, note: str = "") -> None:
        """Count a confirmation by ``by`` and set the verdict to ``confirmed_true``.

        The verdict, the ``confirmed`` flag and ``last_reviewed`` are updated
        by ``set_verdict``, which also appends its own ``"set_verdict"``
        history entry; a second ``"confirm"`` entry is appended here, so one
        call produces two review-history entries.
        """
        self.confirmations += 1
-        self.confirmed = True
+        self.set_verdict(by, "confirmed_true", note)
-        self.last_reviewed = _now()
+        # Preserve historic action tag too
        # Reuses the timestamp that set_verdict stored in last_reviewed.
        self.review_history.append(ReviewEntry(by, "confirm", self.last_reviewed, note))
    def reject(self, by: str, note: str = "") -> None:
        """Count a rejection by ``by`` and set the verdict to ``confirmed_false``.

        ``set_verdict`` updates the verdict and the ``confirmed`` flag and
        appends its own ``"set_verdict"`` history entry; a second ``"reject"``
        entry is appended here, so one call produces two review-history
        entries.
        """
        self.rejections += 1
-        self.confirmed = False
+        self.set_verdict(by, "confirmed_false", note)
        # NOTE(review): set_verdict already assigns last_reviewed. If this
        # re-assignment is kept, the "reject" entry below may carry a later
        # timestamp than the "set_verdict" entry, unlike confirm() - confirm
        # whether that is intended.
        self.last_reviewed = _now()
        self.review_history.append(ReviewEntry(by, "reject", self.last_reviewed, note))
    def score(self) -> float:
        """Confidence-Score aus Korrekturhistorie."""
        # verdict-first scoring (explicit, non-binary)
        if self.verdict == "confirmed_true":
            return 1.0
        if self.verdict == "confirmed_false":
            return 0.0
        if self.verdict == "probable_true":
            return 0.75
        if self.verdict == "probable_false":
            return 0.25
        total = self.confirmations + self.rejections
        if total == 0:
            return 0.5  # Unbestimmt
@@ -74,6 +108,8 @@ class Correctness:
            else:
                review_history.append(entry.to_dict())
        return {
            "verdict": self.verdict,
            "evidence": self.evidence,
            "confirmed": self.confirmed,
            "confirmations": self.confirmations,
            "rejections": self.rejections,
@@ -84,11 +120,30 @@ class Correctness:
    @classmethod
    def from_dict(cls, d: dict) -> "Correctness":
        """Build a ``Correctness`` from its serialized dict form.

        Rows written before the verdict model existed carry no ``verdict``
        key; for those (and for unrecognised verdict strings) the verdict is
        inferred from the legacy fields: ``confirmed`` -> ``confirmed_true``,
        ``rejections > 0`` -> ``confirmed_false``, otherwise ``unknown``.

        Args:
            d: Mapping as produced by ``to_dict``. Missing keys fall back to
                defaults; null or non-numeric counters are read as ``0`` and
                a null ``review_history`` as an empty list.

        Returns:
            A new instance whose ``confirmed`` flag agrees with a final
            verdict.
        """

        def _count(key: str) -> int:
            # A null or non-numeric counter would otherwise break the
            # ``c.rejections > 0`` comparison in the backfill below.
            try:
                return int(d.get(key) or 0)
            except (TypeError, ValueError):
                return 0

        c = cls()
        verdict = d.get("verdict")
        if isinstance(verdict, str) and verdict.strip():
            c.verdict = verdict.strip()
        c.confirmed = bool(d.get("confirmed", False))
        c.confirmations = _count("confirmations")
        c.rejections = _count("rejections")
        c.last_reviewed = d.get("last_reviewed")
        ev = d.get("evidence", [])
        if isinstance(ev, list):
            c.evidence = [e for e in ev if isinstance(e, dict)]
        # ``or []`` tolerates an explicit null stored for the history.
        c.review_history = [ReviewEntry.from_dict(r) for r in (d.get("review_history") or [])]
        # Backfill verdict if missing/invalid.
        if c.verdict not in ("unknown", "probable_true", "probable_false", "confirmed_true", "confirmed_false"):
            if c.confirmed:
                c.verdict = "confirmed_true"
            elif c.rejections > 0:
                c.verdict = "confirmed_false"
            else:
                c.verdict = "unknown"
        # Ensure boolean stays consistent for older mixed data.
        if c.verdict == "confirmed_true":
            c.confirmed = True
        elif c.verdict == "confirmed_false":
            c.confirmed = False
        return c
--- a/static/style.css
+++ b/static/style.css
@@ -124,6 +124,24 @@ body {
    color: #8a9aff;
    font-size: 0.72rem;
 }
 /* Rounded badge that displays an engram's verdict on a dashboard card. */
 .verdict-pill{
    display:inline-block;
    margin: 2px 6px 2px 0;
    padding: 2px 8px;
    border-radius: 999px; /* large radius gives fully rounded ends */
    font-size: 0.72rem;
    font-weight: 800;
    letter-spacing: 0.4px;
    border: 1px solid #2a2a3a;
    background: #1e1e28;
    color: #cfd3ff;
 }
 /* Per-verdict modifiers: each overrides only the border and text color.
    renderVerdictPill() in templates/dashboard.html picks the class:
    v-true = confirmed_true, v-false = confirmed_false,
    v-prob-true = probable_true, v-prob-false = probable_false,
    v-unknown = unknown or any unrecognised verdict. */
 .verdict-pill.v-true{ border-color:#2f6b3f; color:#aaf0b6; }
 .verdict-pill.v-false{ border-color:#7a2c2c; color:#ffb3b3; }
 .verdict-pill.v-prob-true{ border-color:#6c8af5; color:#cfd9ff; }
 .verdict-pill.v-prob-false{ border-color:#b08a2a; color:#ffe2a3; }
 .verdict-pill.v-unknown{ border-color:#3a3a55; color:#b9b9c9; }
 .muted {
    color: #888899;
    font-size: 0.8rem;
--- a/templates/dashboard.html
+++ b/templates/dashboard.html
@@ -29,6 +29,7 @@
                <option value="all">Alle</option>
                <option value="pending">Pending</option>
                <option value="confirmed">Confirmed</option>
                <option value="rejected">Rejected</option>
                <option value="errors">Errors</option>
            </select>
        </div>
@@ -130,6 +131,7 @@ async function loadCards() {
    if (state.search) url += `&search=${encodeURIComponent(state.search)}`;
    if (state.filter === 'confirmed') url += '&confirmed=1';
    if (state.filter === 'pending') url += '&confirmed=0';
    if (state.filter === 'rejected') url += '&verdict=confirmed_false';
    if (state.filter === 'errors') url += '&tag=error';
    const data = await api(url);
@@ -347,6 +349,7 @@ function renderCards() {
        <div class="card ${item.confirmed ? 'confirmed' : ''} ${item.rejections > 0 ? 'rejected' : ''}" data-id="${item.id}">
            <div class="card-header">
                <span class="conf-badge" style="background:hsl(${item.confidence*120},70%,40%)">${Math.round(item.confidence*100)}%</span>
                ${renderVerdictPill(item)}
                <span class="tags">${item.tags.map(t => '<span class="tag">'+t+'</span>').join('')}</span>
                <span class="date">${fmtDate(item.created)}</span>
            </div>
@@ -365,6 +368,19 @@ function renderCards() {
    `).join('');
 }
 /**
  * Build the HTML for the verdict badge of one engram card.
  *
  * @param {{verdict?: string}} item - Engram as returned by the API.
  * @returns {string} A `<span class="verdict-pill ...">` element, or an empty
  *     string when the item has no verdict.
  */
 function renderVerdictPill(item) {
    const v = (item.verdict || '').toString();
    if (!v) return '';
    // Known verdicts -> [modifier class, display label]. A Map avoids
    // accidental hits on inherited object keys such as "constructor".
    const known = new Map([
        ['confirmed_true', ['v-true', 'TRUE']],
        ['confirmed_false', ['v-false', 'FALSE']],
        ['probable_true', ['v-prob-true', 'LIKELY']],
        ['probable_false', ['v-prob-false', 'UNLIKELY']],
        ['unknown', ['v-unknown', 'UNKNOWN']],
    ]);
    const hit = known.get(v);
    if (hit) {
        return `<span class="verdict-pill ${hit[0]}">${hit[1]}</span>`;
    }
    // Unrecognised verdicts are shown verbatim; escape them because the
    // value comes from stored data and is inserted into the page as HTML.
    const entities = new Map([
        ['&', '&amp;'],
        ['<', '&lt;'],
        ['>', '&gt;'],
        ['"', '&quot;'],
        ["'", '&#39;'],
    ]);
    const label = v.replace(/[&<>"']/g, ch => entities.get(ch));
    return `<span class="verdict-pill v-unknown">${label}</span>`;
 }
 function fmtDate(iso) {
    const d = new Date(iso);
    return `${d.getDate().toString().padStart(2,'0')}.${(d.getMonth()+1).toString().padStart(2,'0')} ${d.getHours().toString().padStart(2,'0')}:${d.getMinutes().toString().padStart(2,'0')}`;