feat: add verdict/evidence verification model

This commit is contained in:
2026-05-29 11:30:24 +02:00
parent f10a5b9f19
commit 6d99c520e6
5 changed files with 227 additions and 51 deletions

View File

@@ -78,7 +78,7 @@ def main() -> int:
pending = [ pending = [
eg eg
for eg in all_egs for eg in all_egs
if (not eg.correctness.confirmed and eg.correctness.rejections == 0) if (not eg.correctness.is_final())
] ]
confirmed = 0 confirmed = 0
@@ -94,16 +94,7 @@ def main() -> int:
if src == "session" and ( if src == "session" and (
content.startswith("Session Summary (sess_") or content.startswith("Please remember ") content.startswith("Session Summary (sess_") or content.startswith("Please remember ")
): ):
eg.correctness.rejections += 1 eg.correctness.reject("verify-pending", "Auto-reject: session placeholder")
eg.correctness.last_reviewed = _now()
eg.correctness.review_history.append(
ReviewEntry(
by="verify-pending",
action="reject",
at=_now(),
note="Auto-reject: session placeholder",
)
)
store.save(eg) store.save(eg)
rejected += 1 rejected += 1
continue continue
@@ -118,30 +109,11 @@ def main() -> int:
still_pending += 1 still_pending += 1
continue continue
if 200 <= status < 300: if 200 <= status < 300:
eg.correctness.confirmed = True eg.correctness.confirm("verify-pending", f"Auto-confirm: web url ok ({status}) {url}")
eg.correctness.confirmations += 1
eg.correctness.last_reviewed = _now()
eg.correctness.review_history.append(
ReviewEntry(
by="verify-pending",
action="confirm",
at=_now(),
note=f"Auto-confirm: web url ok ({status}) {url}",
)
)
store.save(eg) store.save(eg)
confirmed += 1 confirmed += 1
else: else:
eg.correctness.rejections += 1 eg.correctness.reject("verify-pending", f"Auto-reject: web url status={status} {url}")
eg.correctness.last_reviewed = _now()
eg.correctness.review_history.append(
ReviewEntry(
by="verify-pending",
action="reject",
at=_now(),
note=f"Auto-reject: web url status={status} {url}",
)
)
store.save(eg) store.save(eg)
rejected += 1 rejected += 1
continue continue

View File

@@ -54,11 +54,22 @@ def get_db():
def parse_engram(row: sqlite3.Row) -> dict: def parse_engram(row: sqlite3.Row) -> dict:
meta = json.loads(row["metadata_json"] or "{}") meta = json.loads(row["metadata_json"] or "{}")
correctness = json.loads(row["correctness_json"] or "{}") correctness = json.loads(row["correctness_json"] or "{}")
verdict = correctness.get("verdict")
if not isinstance(verdict, str) or not verdict:
# Back-compat inference for older rows
if correctness.get("confirmed", False):
verdict = "confirmed_true"
elif int(correctness.get("rejections", 0) or 0) > 0:
verdict = "confirmed_false"
else:
verdict = "unknown"
return { return {
"id": row["id"], "id": row["id"],
"content": row["content"], "content": row["content"],
"confidence": meta.get("confidence", 0.0), "confidence": meta.get("confidence", 0.0),
"confirmed": correctness.get("confirmed", False), "confirmed": correctness.get("confirmed", False),
"verdict": verdict,
"evidence": correctness.get("evidence", []),
"confirmations": correctness.get("confirmations", 0), "confirmations": correctness.get("confirmations", 0),
"rejections": correctness.get("rejections", 0), "rejections": correctness.get("rejections", 0),
"tags": meta.get("tags", []), "tags": meta.get("tags", []),
@@ -88,6 +99,8 @@ def _update_correctness(engram_id: str, *, action: str, reason: str | None = Non
raise FileNotFoundError(f"Engram not found: {engram_id}") raise FileNotFoundError(f"Engram not found: {engram_id}")
corr = json.loads(row["correctness_json"] or "{}") corr = json.loads(row["correctness_json"] or "{}")
corr.setdefault("verdict", None)
corr.setdefault("evidence", [])
corr.setdefault("confirmed", False) corr.setdefault("confirmed", False)
corr.setdefault("confirmations", 0) corr.setdefault("confirmations", 0)
corr.setdefault("rejections", 0) corr.setdefault("rejections", 0)
@@ -106,10 +119,30 @@ def _update_correctness(engram_id: str, *, action: str, reason: str | None = Non
corr["review_history"] = [entry] corr["review_history"] = [entry]
if action == "confirm": if action == "confirm":
corr["verdict"] = "confirmed_true"
corr["confirmed"] = True corr["confirmed"] = True
corr["confirmations"] = int(corr.get("confirmations", 0) or 0) + 1 corr["confirmations"] = int(corr.get("confirmations", 0) or 0) + 1
elif action == "reject": elif action == "reject":
corr["verdict"] = "confirmed_false"
corr["rejections"] = int(corr.get("rejections", 0) or 0) + 1 corr["rejections"] = int(corr.get("rejections", 0) or 0) + 1
corr["confirmed"] = False
# Store minimal evidence for dashboard-driven actions.
try:
ev = corr.get("evidence")
if not isinstance(ev, list):
ev = []
ev.append(
{
"kind": "human",
"by": "dashboard",
"at": corr["last_reviewed"],
"action": action,
}
)
corr["evidence"] = ev[-50:] # cap growth
except Exception:
pass
c.execute( c.execute(
"UPDATE engrams SET correctness_json = ?, modified_at = ? WHERE id = ?", "UPDATE engrams SET correctness_json = ?, modified_at = ? WHERE id = ?",
@@ -232,8 +265,25 @@ def api_storage_stats():
conn = get_db() conn = get_db()
c = conn.cursor() c = conn.cursor()
total = c.execute("SELECT COUNT(*) FROM engrams").fetchone()[0] total = c.execute("SELECT COUNT(*) FROM engrams").fetchone()[0]
confirmed = c.execute( confirmed_true = c.execute(
"SELECT COUNT(*) FROM engrams WHERE json_extract(correctness_json, '$.confirmed') = 1" """
SELECT COUNT(*) FROM engrams
WHERE (
json_extract(correctness_json, '$.verdict') = 'confirmed_true'
OR (json_extract(correctness_json, '$.verdict') IS NULL AND json_extract(correctness_json, '$.confirmed') = 1)
)
"""
).fetchone()[0]
confirmed_false = c.execute(
"""
SELECT COUNT(*) FROM engrams
WHERE (
json_extract(correctness_json, '$.verdict') = 'confirmed_false'
OR (json_extract(correctness_json, '$.verdict') IS NULL
AND json_extract(correctness_json, '$.confirmed') = 0
AND COALESCE(json_extract(correctness_json, '$.rejections'), 0) > 0)
)
"""
).fetchone()[0] ).fetchone()[0]
sources = { sources = {
r[0]: r[1] r[0]: r[1]
@@ -268,8 +318,9 @@ def api_storage_stats():
return { return {
"sql": { "sql": {
"total_engrams": total, "total_engrams": total,
"confirmed": confirmed, "confirmed": confirmed_true,
"pending": total - confirmed, "rejected": confirmed_false,
"pending": total - confirmed_true - confirmed_false,
"by_source": sources, "by_source": sources,
}, },
"vector": { "vector": {
@@ -310,10 +361,27 @@ def api_insights(limit: int = Query(8, ge=1, le=50)):
"SELECT id, metadata_json, correctness_json, created_at, modified_at FROM engrams ORDER BY created_at DESC LIMIT 2000" "SELECT id, metadata_json, correctness_json, created_at, modified_at FROM engrams ORDER BY created_at DESC LIMIT 2000"
).fetchall() ).fetchall()
total = c.execute("SELECT COUNT(*) FROM engrams").fetchone()[0] total = c.execute("SELECT COUNT(*) FROM engrams").fetchone()[0]
confirmed = c.execute( confirmed_true = c.execute(
"SELECT COUNT(*) FROM engrams WHERE json_extract(correctness_json, '$.confirmed') = 1" """
SELECT COUNT(*) FROM engrams
WHERE (
json_extract(correctness_json, '$.verdict') = 'confirmed_true'
OR (json_extract(correctness_json, '$.verdict') IS NULL AND json_extract(correctness_json, '$.confirmed') = 1)
)
"""
).fetchone()[0] ).fetchone()[0]
pending = total - confirmed confirmed_false = c.execute(
"""
SELECT COUNT(*) FROM engrams
WHERE (
json_extract(correctness_json, '$.verdict') = 'confirmed_false'
OR (json_extract(correctness_json, '$.verdict') IS NULL
AND json_extract(correctness_json, '$.confirmed') = 0
AND COALESCE(json_extract(correctness_json, '$.rejections'), 0) > 0)
)
"""
).fetchone()[0]
pending = total - confirmed_true - confirmed_false
tag_counts: dict[str, int] = {} tag_counts: dict[str, int] = {}
source_counts: dict[str, int] = {} source_counts: dict[str, int] = {}
@@ -488,10 +556,27 @@ def api_stats():
conn = get_db() conn = get_db()
c = conn.cursor() c = conn.cursor()
total = c.execute("SELECT COUNT(*) FROM engrams").fetchone()[0] total = c.execute("SELECT COUNT(*) FROM engrams").fetchone()[0]
confirmed = c.execute( confirmed_true = c.execute(
"SELECT COUNT(*) FROM engrams WHERE json_extract(correctness_json, '$.confirmed') = 1" """
SELECT COUNT(*) FROM engrams
WHERE (
json_extract(correctness_json, '$.verdict') = 'confirmed_true'
OR (json_extract(correctness_json, '$.verdict') IS NULL AND json_extract(correctness_json, '$.confirmed') = 1)
)
"""
).fetchone()[0] ).fetchone()[0]
pending = total - confirmed confirmed_false = c.execute(
"""
SELECT COUNT(*) FROM engrams
WHERE (
json_extract(correctness_json, '$.verdict') = 'confirmed_false'
OR (json_extract(correctness_json, '$.verdict') IS NULL
AND json_extract(correctness_json, '$.confirmed') = 0
AND COALESCE(json_extract(correctness_json, '$.rejections'), 0) > 0)
)
"""
).fetchone()[0]
pending = total - confirmed_true - confirmed_false
errors = c.execute( errors = c.execute(
"SELECT COUNT(*) FROM engrams WHERE json_extract(metadata_json, '$.tags') LIKE '%error%'" "SELECT COUNT(*) FROM engrams WHERE json_extract(metadata_json, '$.tags') LIKE '%error%'"
).fetchone()[0] ).fetchone()[0]
@@ -501,7 +586,8 @@ def api_stats():
conn.close() conn.close()
return { return {
"total": total, "total": total,
"confirmed": confirmed, "confirmed": confirmed_true,
"rejected": confirmed_false,
"pending": pending, "pending": pending,
"errors": errors, "errors": errors,
"avg_confidence": round(avg_conf, 2), "avg_confidence": round(avg_conf, 2),
@@ -514,6 +600,7 @@ def api_engrams(
offset: int = Query(0, ge=0), offset: int = Query(0, ge=0),
tag: str = Query(None), tag: str = Query(None),
confirmed: bool = Query(None), confirmed: bool = Query(None),
verdict: str = Query(None),
search: str = Query(None), search: str = Query(None),
min_confidence: float = Query(0.0), min_confidence: float = Query(0.0),
): ):
@@ -527,9 +614,30 @@ def api_engrams(
params.append(f'%"{tag}"%') params.append(f'%"{tag}"%')
if confirmed is not None: if confirmed is not None:
if confirmed:
# confirmed == statement is true (verdict confirmed_true)
where_clauses.append( where_clauses.append(
f"json_extract(correctness_json, '$.confirmed') = {int(confirmed)}" "("
"json_extract(correctness_json, '$.verdict') = 'confirmed_true' "
"OR (json_extract(correctness_json, '$.verdict') IS NULL AND json_extract(correctness_json, '$.confirmed') = 1)"
")"
) )
else:
# pending/unresolved (unknown/probable) but exclude confirmed_false.
where_clauses.append(
"("
"json_extract(correctness_json, '$.verdict') IN ('unknown','probable_true','probable_false') "
"OR (json_extract(correctness_json, '$.verdict') IS NULL "
" AND json_extract(correctness_json, '$.confirmed') = 0 "
" AND COALESCE(json_extract(correctness_json, '$.rejections'), 0) = 0)"
")"
)
if verdict:
v = verdict.strip()
if v in ("unknown", "probable_true", "probable_false", "confirmed_true", "confirmed_false"):
where_clauses.append("json_extract(correctness_json, '$.verdict') = ?")
params.append(v)
if search: if search:
# Use FTS # Use FTS
@@ -740,6 +848,8 @@ def api_create_engram(content: str = Form(...), tags: str = Form(""), source: st
"hash": "", "hash": "",
} }
correctness = { correctness = {
"verdict": "unknown",
"evidence": [],
"confirmed": False, "confirmed": False,
"confirmations": 0, "confirmations": 0,
"rejections": 0, "rejections": 0,
@@ -767,7 +877,12 @@ def api_pending(limit: int = Query(20, ge=1, le=100), offset: int = Query(0, ge=
rows = c.execute( rows = c.execute(
""" """
SELECT * FROM engrams SELECT * FROM engrams
WHERE json_extract(correctness_json, '$.confirmed') = 0 WHERE (
json_extract(correctness_json, '$.verdict') IN ('unknown','probable_true','probable_false')
OR (json_extract(correctness_json, '$.verdict') IS NULL
AND json_extract(correctness_json, '$.confirmed') = 0
AND COALESCE(json_extract(correctness_json, '$.rejections'), 0) = 0)
)
ORDER BY created_at DESC ORDER BY created_at DESC
LIMIT ? OFFSET ? LIMIT ? OFFSET ?
""", """,

View File

@@ -40,26 +40,60 @@ class ReviewEntry:
@dataclass @dataclass
class Correctness: class Correctness:
"""Verfolgt die Korrektheit eines Engramms über Zeit.""" """Verfolgt die Korrektheit eines Engramms über Zeit."""
# verdict model (not only binary confirm/reject)
# Values:
# - unknown
# - probable_true / probable_false
# - confirmed_true / confirmed_false
verdict: str = "unknown"
evidence: List[Dict[str, Any]] = field(default_factory=list)
confirmed: bool = False confirmed: bool = False
confirmations: int = 0 confirmations: int = 0
rejections: int = 0 rejections: int = 0
last_reviewed: Optional[str] = None last_reviewed: Optional[str] = None
review_history: List[ReviewEntry] = field(default_factory=list) review_history: List[ReviewEntry] = field(default_factory=list)
def is_final(self) -> bool:
    """Return True when the verdict is one of the two terminal states.

    The terminal states are "confirmed_true" and "confirmed_false";
    every other verdict value counts as still open.
    """
    current = self.verdict
    return current == "confirmed_true" or current == "confirmed_false"
def set_verdict(self, by: str, verdict: str, note: str = "", evidence: Optional[List[Dict[str, Any]]] = None) -> None:
    """Set the verdict, keep the legacy ``confirmed`` flag in sync, and log it.

    Args:
        by: Identifier of the reviewer or tool setting the verdict.
        verdict: One of "unknown", "probable_true", "probable_false",
            "confirmed_true", "confirmed_false". Anything else (including
            non-string values) is stored as "unknown".
        note: Optional free-text note recorded in the review history.
        evidence: Optional evidence records; only dict items are kept.
    """
    # Non-string input is treated like an unrecognised verdict instead of
    # raising AttributeError on .strip().
    verdict = verdict.strip() if isinstance(verdict, str) else ""
    if verdict not in ("unknown", "probable_true", "probable_false", "confirmed_true", "confirmed_false"):
        verdict = "unknown"
    self.verdict = verdict
    # Keep backward-compatible boolean in sync:
    # historically, confirmed=True meant "this statement is correct".
    self.confirmed = verdict == "confirmed_true"
    self.last_reviewed = _now()
    if evidence:
        # Best-effort: evidence that is not iterable is ignored, but any
        # other failure is no longer swallowed silently.
        try:
            usable = [e for e in evidence if isinstance(e, dict)]
        except TypeError:
            usable = []
        self.evidence.extend(usable)
    # Without a note, record just the verdict rather than a dangling
    # "verdict:" with nothing after the colon.
    tail = str(note).rstrip() if note else ""
    history_note = f"{verdict}: {tail}" if tail else verdict
    self.review_history.append(ReviewEntry(by, "set_verdict", self.last_reviewed, history_note))
def confirm(self, by: str, note: str = "") -> None:
    """Record a confirmation: bump the counter and set "confirmed_true".

    A "confirm" history entry is appended after the verdict change,
    stamped with the same ``last_reviewed`` time.
    """
    self.confirmations = self.confirmations + 1
    self.set_verdict(by, "confirmed_true", note)
    # Keep the historic "confirm" action tag in the review history as well.
    legacy_entry = ReviewEntry(by, "confirm", self.last_reviewed, note)
    self.review_history.append(legacy_entry)
def reject(self, by: str, note: str = "") -> None:
    """Record a rejection: bump the counter and set "confirmed_false".

    A "reject" history entry is appended after the verdict change,
    stamped with the same ``last_reviewed`` time.
    """
    self.rejections = self.rejections + 1
    self.set_verdict(by, "confirmed_false", note)
    # Keep the historic "reject" action tag in the review history as well.
    legacy_entry = ReviewEntry(by, "reject", self.last_reviewed, note)
    self.review_history.append(legacy_entry)
def score(self) -> float: def score(self) -> float:
"""Confidence-Score aus Korrekturhistorie.""" """Confidence-Score aus Korrekturhistorie."""
# verdict-first scoring (explicit, non-binary)
if self.verdict == "confirmed_true":
return 1.0
if self.verdict == "confirmed_false":
return 0.0
if self.verdict == "probable_true":
return 0.75
if self.verdict == "probable_false":
return 0.25
total = self.confirmations + self.rejections total = self.confirmations + self.rejections
if total == 0: if total == 0:
return 0.5 # Unbestimmt return 0.5 # Unbestimmt
@@ -74,6 +108,8 @@ class Correctness:
else: else:
review_history.append(entry.to_dict()) review_history.append(entry.to_dict())
return { return {
"verdict": self.verdict,
"evidence": self.evidence,
"confirmed": self.confirmed, "confirmed": self.confirmed,
"confirmations": self.confirmations, "confirmations": self.confirmations,
"rejections": self.rejections, "rejections": self.rejections,
@@ -84,11 +120,30 @@ class Correctness:
@classmethod
def from_dict(cls, d: dict) -> "Correctness":
    """Rebuild a Correctness from its serialized dict form.

    Rows written before the verdict model have no usable "verdict" key.
    For those (and for unrecognised values) the verdict is inferred from
    the legacy fields: confirmed -> "confirmed_true", any rejections ->
    "confirmed_false", otherwise "unknown". The ``confirmed`` flag is then
    derived from the verdict so the two can never disagree.

    Args:
        d: Mapping as stored in the correctness JSON.

    Returns:
        The populated Correctness instance.
    """
    valid_verdicts = ("unknown", "probable_true", "probable_false", "confirmed_true", "confirmed_false")

    def _as_count(value) -> int:
        # Stored counters may be None or strings in older/mixed data.
        try:
            return int(value or 0)
        except (TypeError, ValueError):
            return 0

    c = cls()
    c.confirmations = _as_count(d.get("confirmations", 0))
    c.rejections = _as_count(d.get("rejections", 0))
    c.last_reviewed = d.get("last_reviewed")

    ev = d.get("evidence", [])
    if isinstance(ev, list):
        c.evidence = [e for e in ev if isinstance(e, dict)]
    # `or []` tolerates an explicit null review_history.
    c.review_history = [ReviewEntry.from_dict(r) for r in d.get("review_history") or []]

    raw_verdict = d.get("verdict")
    verdict = raw_verdict.strip() if isinstance(raw_verdict, str) else ""
    if verdict not in valid_verdicts:
        # Backfill when the verdict is missing OR invalid. Checking the raw
        # value (not the instance default "unknown") is what makes the
        # "missing" case actually reach this branch.
        if d.get("confirmed", False):
            verdict = "confirmed_true"
        elif c.rejections > 0:
            verdict = "confirmed_false"
        else:
            verdict = "unknown"
    c.verdict = verdict
    # The verdict is authoritative; the legacy boolean mirrors it.
    c.confirmed = verdict == "confirmed_true"
    return c

View File

@@ -124,6 +124,24 @@ body {
color: #8a9aff; color: #8a9aff;
font-size: 0.72rem; font-size: 0.72rem;
} }
/* Verdict badge: base pill shape and neutral colours. */
.verdict-pill{
display:inline-block;
margin: 2px 6px 2px 0;
padding: 2px 8px;
border-radius: 999px;
font-size: 0.72rem;
font-weight: 800;
letter-spacing: 0.4px;
border: 1px solid #2a2a3a;
background: #1e1e28;
color: #cfd3ff;
}
/* Per-verdict modifiers: each overrides only the border and text colour. */
.verdict-pill.v-true{ border-color:#2f6b3f; color:#aaf0b6; }
.verdict-pill.v-false{ border-color:#7a2c2c; color:#ffb3b3; }
.verdict-pill.v-prob-true{ border-color:#6c8af5; color:#cfd9ff; }
.verdict-pill.v-prob-false{ border-color:#b08a2a; color:#ffe2a3; }
.verdict-pill.v-unknown{ border-color:#3a3a55; color:#b9b9c9; }
.muted { .muted {
color: #888899; color: #888899;
font-size: 0.8rem; font-size: 0.8rem;

View File

@@ -29,6 +29,7 @@
<option value="all">Alle</option> <option value="all">Alle</option>
<option value="pending">Pending</option> <option value="pending">Pending</option>
<option value="confirmed">Confirmed</option> <option value="confirmed">Confirmed</option>
<option value="rejected">Rejected</option>
<option value="errors">Errors</option> <option value="errors">Errors</option>
</select> </select>
</div> </div>
@@ -130,6 +131,7 @@ async function loadCards() {
if (state.search) url += `&search=${encodeURIComponent(state.search)}`; if (state.search) url += `&search=${encodeURIComponent(state.search)}`;
if (state.filter === 'confirmed') url += '&confirmed=1'; if (state.filter === 'confirmed') url += '&confirmed=1';
if (state.filter === 'pending') url += '&confirmed=0'; if (state.filter === 'pending') url += '&confirmed=0';
if (state.filter === 'rejected') url += '&verdict=confirmed_false';
if (state.filter === 'errors') url += '&tag=error'; if (state.filter === 'errors') url += '&tag=error';
const data = await api(url); const data = await api(url);
@@ -347,6 +349,7 @@ function renderCards() {
<div class="card ${item.confirmed ? 'confirmed' : ''} ${item.rejections > 0 ? 'rejected' : ''}" data-id="${item.id}"> <div class="card ${item.confirmed ? 'confirmed' : ''} ${item.rejections > 0 ? 'rejected' : ''}" data-id="${item.id}">
<div class="card-header"> <div class="card-header">
<span class="conf-badge" style="background:hsl(${item.confidence*120},70%,40%)">${Math.round(item.confidence*100)}%</span> <span class="conf-badge" style="background:hsl(${item.confidence*120},70%,40%)">${Math.round(item.confidence*100)}%</span>
${renderVerdictPill(item)}
<span class="tags">${item.tags.map(t => '<span class="tag">'+t+'</span>').join('')}</span> <span class="tags">${item.tags.map(t => '<span class="tag">'+t+'</span>').join('')}</span>
<span class="date">${fmtDate(item.created)}</span> <span class="date">${fmtDate(item.created)}</span>
</div> </div>
@@ -365,6 +368,19 @@ function renderCards() {
`).join(''); `).join('');
} }
/**
 * Build the HTML for an item's verdict badge.
 *
 * Known verdicts map to a fixed CSS modifier class and label. Any other
 * non-empty value is shown with the "v-unknown" style and its raw text,
 * HTML-escaped because it comes from stored data and is inserted as markup.
 *
 * @param {{verdict?: string}} item - Item whose `verdict` field is rendered.
 * @returns {string} The badge markup, or '' when the item has no verdict.
 */
function renderVerdictPill(item) {
  const v = (item.verdict || '').toString();
  if (!v) return '';

  // A Map avoids accidental hits on Object.prototype keys such as "constructor".
  const known = new Map([
    ['confirmed_true', ['v-true', 'TRUE']],
    ['confirmed_false', ['v-false', 'FALSE']],
    ['probable_true', ['v-prob-true', 'LIKELY']],
    ['probable_false', ['v-prob-false', 'UNLIKELY']],
    ['unknown', ['v-unknown', 'UNKNOWN']],
  ]);

  const hit = known.get(v);
  if (hit) {
    return `<span class="verdict-pill ${hit[0]}">${hit[1]}</span>`;
  }

  // Unrecognised verdict: keep showing its text, but never as live markup.
  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const safeLabel = v.replace(/[&<>"']/g, ch => entities[ch]);
  return `<span class="verdict-pill v-unknown">${safeLabel}</span>`;
}
function fmtDate(iso) { function fmtDate(iso) {
const d = new Date(iso); const d = new Date(iso);
return `${d.getDate().toString().padStart(2,'0')}.${(d.getMonth()+1).toString().padStart(2,'0')} ${d.getHours().toString().padStart(2,'0')}:${d.getMinutes().toString().padStart(2,'0')}`; return `${d.getDate().toString().padStart(2,'0')}.${(d.getMonth()+1).toString().padStart(2,'0')} ${d.getHours().toString().padStart(2,'0')}:${d.getMinutes().toString().padStart(2,'0')}`;