chore: sync local workspace state

2026-05-30 00:38:57 +02:00
parent 20098a3253
commit e6e8eba8f6
8 changed files with 5626 additions and 68 deletions
--- a/scripts/import_web_design_markers.py
+++ b/scripts/import_web_design_markers.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import hashlib
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+from src.engram import Engram, Grounding
+from src.store import EngramStore
+
+
+def _now_utc_iso() -> str:
+    """Return the current time in UTC as an ISO-8601 string (with offset)."""
+    current = datetime.now(tz=timezone.utc)
+    return current.isoformat()
+
+
+def _hash16(text: str) -> str:
+    """Return the first 16 hex characters of the SHA-256 digest of *text* (encoded as UTF-8)."""
+    digest = hashlib.sha256()
+    digest.update(text.encode("utf-8"))
+    full_hex = digest.hexdigest()
+    return full_hex[:16]
+
+
+def _iter_jsonl(path: Path) -> Iterable[Dict[str, Any]]:
+    """Yield every JSON object found in a JSON Lines file.
+
+    Blank lines are skipped, and so are lines whose JSON value is not an
+    object (lists, numbers, strings, ...). The file is decoded as UTF-8; a
+    leading byte-order mark, if present, is dropped so it cannot make the
+    first record unparseable.
+
+    Args:
+        path: JSONL file to read.
+
+    Raises:
+        SystemExit: if a non-blank line is not valid JSON. The message names
+            the file and the 1-based line number.
+    """
+    # "utf-8-sig" reads plain UTF-8 unchanged and strips a BOM when there is one.
+    with path.open("r", encoding="utf-8-sig") as f:
+        for line_no, raw in enumerate(f, start=1):
+            text = raw.strip()
+            if not text:
+                continue
+            try:
+                obj = json.loads(text)
+            except ValueError as exc:  # json.JSONDecodeError is a ValueError
+                raise SystemExit(f"Invalid JSON at {path}:{line_no}") from exc
+            if isinstance(obj, dict):
+                yield obj
+
+
+def _as_text(value: Any) -> str:
+    """Return *value* as a stripped string, mapping ``None`` to ``""``."""
+    return "" if value is None else str(value).strip()
+
+
+def _marker_to_content(marker_obj: Dict[str, Any]) -> Tuple[str, List[Dict[str, Any]]]:
+    """Render one marker record as text and collect its source evidence.
+
+    The text starts with a ``WEBDEV_MARKER:`` line, followed by optional
+    ``Details``, ``Checks`` (first 8 entries) and ``Sources`` (first 12
+    entries) sections separated by blank lines.
+
+    Args:
+        marker_obj: Parsed marker record. Reads the keys ``marker``,
+            ``details``, ``checks`` and ``sources``.
+
+    Returns:
+        ``(content, evidence)`` where ``evidence`` is the full list of
+        ``{"url", "title"}`` dicts built from every source that has a URL
+        (not truncated to 12 like the rendered section).
+
+    Raises:
+        ValueError: if ``marker`` is missing, null or blank.
+    """
+    # A JSON null must count as "missing"; str(None) would yield the text "None".
+    marker = _as_text(marker_obj.get("marker"))
+    if not marker:
+        raise ValueError("missing marker")
+    details = _as_text(marker_obj.get("details"))
+
+    checks = marker_obj.get("checks") or []
+    sources = marker_obj.get("sources") or []
+    if not isinstance(sources, list):
+        # Same tolerance as for ``checks``: a wrongly typed field is ignored.
+        sources = []
+
+    evidence: List[Dict[str, Any]] = []
+    for src in sources:
+        if not isinstance(src, dict):
+            continue
+        url = _as_text(src.get("url") or "")
+        if not url:
+            continue
+        evidence.append({"url": url, "title": _as_text(src.get("title") or "")})
+
+    lines: List[str] = [f"WEBDEV_MARKER: {marker}"]
+    if details:
+        lines += ["", f"Details: {details}"]
+    if isinstance(checks, list) and checks:
+        lines += ["", "Checks:"]
+        for check in checks[:8]:
+            check_text = _as_text(check)
+            if check_text:
+                lines.append(f"- {check_text}")
+    if evidence:
+        lines += ["", "Sources:"]
+        for ev in evidence[:12]:
+            # url/title were already normalised when the evidence was built.
+            if ev["title"]:
+                lines.append(f"- {ev['title']}: {ev['url']}")
+            else:
+                lines.append(f"- {ev['url']}")
+    return "\n".join(lines).strip(), evidence
+
+
+def _tags_for(marker_obj: Dict[str, Any]) -> List[str]:
+    """Return the tags for a marker: three fixed tags plus its ``area``, if any.
+
+    A missing, null or blank ``area`` adds nothing, and an ``area`` that
+    equals one of the fixed tags is not added a second time.
+    """
+    tags = ["web_design", "web_development", "mobile"]
+    raw_area = marker_obj.get("area")
+    # Guard against JSON null: str(None) would otherwise produce the tag "None".
+    area = "" if raw_area is None else str(raw_area).strip()
+    if area and area not in tags:
+        tags.append(area)
+    return tags
+
+
+def _load_existing_hashes(store: EngramStore) -> set:
+    """Collect the ``hash`` values already present in stored engram metadata.
+
+    Best effort: if the query fails (for example on a schema mismatch) an
+    empty set is returned, and rows whose metadata cannot be read are skipped.
+    """
+    hashes: set = set()
+    try:
+        cur = store._conn.execute("SELECT metadata_json FROM engrams")  # noqa: SLF001
+        for row in cur.fetchall():
+            try:
+                meta = json.loads(row["metadata_json"])
+                h = meta.get("hash")
+                if isinstance(h, str) and h:
+                    hashes.add(h)
+            except Exception:
+                continue
+    except Exception:
+        # If this fails (schema mismatch), proceed without preload.
+        return set()
+    return hashes
+
+
+def import_markers(
+    db_path: Path,
+    jsonl_paths: List[Path],
+    source: str,
+    verdict: str,
+    agent_id: str,
+    dry_run: bool,
+) -> Dict[str, int]:
+    """Import ``web_design_marker`` records from JSONL files into the engram store.
+
+    Records with any other ``kind`` are ignored and not counted. Each marker
+    is rendered to text and de-duplicated by a 16-character content hash,
+    both against hashes already stored and against earlier records of this run.
+
+    Args:
+        db_path: Path of the SQLite database passed to ``EngramStore``.
+        jsonl_paths: JSONL files to read, in order.
+        source: Value passed as the engram ``source``.
+        verdict: Verdict recorded on each imported engram.
+        agent_id: Agent id to record. When empty, the record's own
+            ``agent_id`` is used for the engram and ``"importer"`` for the verdict.
+        dry_run: When true, nothing is saved. ``imported`` then counts the
+            records that would have been written.
+
+    Returns:
+        Counters ``seen``, ``imported``, ``skipped_dup`` and ``skipped_invalid``.
+
+    Raises:
+        SystemExit: propagated from ``_iter_jsonl`` on a malformed JSON line.
+    """
+    store = EngramStore(str(db_path))
+    stats = {"seen": 0, "imported": 0, "skipped_dup": 0, "skipped_invalid": 0}
+    seen_hashes: set = set()
+
+    # Close the store on every exit path, including a SystemExit raised while parsing.
+    try:
+        # Preload existing hashes (fast-ish; avoids duplicate spam).
+        existing_hashes = _load_existing_hashes(store)
+
+        for path in jsonl_paths:
+            for marker_obj in _iter_jsonl(path):
+                if (marker_obj.get("kind") or "") != "web_design_marker":
+                    continue
+                stats["seen"] += 1
+                try:
+                    content, evidence = _marker_to_content(marker_obj)
+                except Exception:
+                    stats["skipped_invalid"] += 1
+                    continue
+
+                h = _hash16(content)
+                if h in seen_hashes or h in existing_hashes:
+                    stats["skipped_dup"] += 1
+                    continue
+                seen_hashes.add(h)
+
+                eg = Engram.create(
+                    content=content,
+                    source=source,
+                    confidence=0.75,
+                    tags=_tags_for(marker_obj),
+                    session_id=None,
+                    agent_id=agent_id or str(marker_obj.get("agent_id") or ""),
+                    grounding=Grounding.SOURCED,
+                )
+                # Overwrite hash to exactly match our content representation.
+                eg.metadata["hash"] = h
+                eg.metadata["modified"] = _now_utc_iso()
+                eg.metadata["created"] = marker_obj.get("created_at") or eg.metadata["created"]
+
+                eg.correctness.set_verdict(
+                    by=agent_id or "importer",
+                    verdict=verdict,
+                    note=f"Imported from {path.name}",
+                    evidence=evidence,
+                )
+
+                if not dry_run:
+                    store.save(eg)
+                stats["imported"] += 1
+    finally:
+        store.close()
+
+    return stats
+
+
+def main() -> None:
+    """Command-line entry point: parse options, import the matching files, print a JSON summary."""
+    parser = argparse.ArgumentParser(description="Import web_design_marker JSONL files into brain.sqlite")
+    parser.add_argument("--db", default="second-brain/data/brain.sqlite", help="Path to brain.sqlite")
+    parser.add_argument("--glob", default="/tmp/web_design_markers_*.jsonl", help="Glob for marker JSONL files")
+    parser.add_argument("--source", default="web_research", help="Engram source")
+    parser.add_argument("--verdict", default="probable_true", help="Correctness verdict")
+    parser.add_argument("--agent-id", default="web_research_import", help="Agent id to record")
+    parser.add_argument("--dry-run", action="store_true", help="Parse/dedupe but do not write to DB")
+    args = parser.parse_args()
+
+    db_path = Path(args.db)
+
+    # Patterns starting with "/" are matched from the filesystem root with the
+    # leading slashes removed; anything else is matched from the current directory.
+    pattern = args.glob
+    if pattern.startswith("/"):
+        matches = Path("/").glob(pattern.lstrip("/"))
+    else:
+        matches = Path(".").glob(pattern)
+    jsonl_paths = sorted(matches)
+    if not jsonl_paths:
+        raise SystemExit(f"No files match glob: {args.glob}")
+
+    stats = import_markers(
+        db_path=db_path,
+        jsonl_paths=jsonl_paths,
+        source=args.source,
+        verdict=args.verdict,
+        agent_id=args.agent_id,
+        dry_run=bool(args.dry_run),
+    )
+
+    summary = {
+        "db": str(db_path),
+        "files": [str(found) for found in jsonl_paths],
+        "stats": stats,
+    }
+    print(json.dumps(summary, ensure_ascii=False, indent=2))
+
+
+if __name__ == "__main__":
+    main()
+
--- a/scripts/process_pending_engrams.py
+++ b/scripts/process_pending_engrams.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+"""
+Process pending second brain engrams.
+- For unconfirmed, unrejected engrams: evaluate confidence
+- If confidence > 0.8: confirm
+- If confidence < 0.3: reject
+- Otherwise: mark for review (leave as is)
+- Check for stale topics and archive if needed
+- Produce summary report
+"""
+
+import sys
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+
+# Make the ``from src.<module> import ...`` statements below resolvable when
+# this file is run directly: a package import needs the directory that
+# CONTAINS ``src`` on sys.path. The ``src`` directory itself stays first on the
+# path, as before, in case anything relies on it being importable by bare name.
+# ``resolve()`` keeps the result correct even when ``__file__`` is relative.
+base_dir = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(base_dir))
+sys.path.insert(0, str(base_dir / "src"))
+
+# Import using absolute module paths
+from src.store import EngramStore
+from src.engram import Engram, Grounding
+
+DB_PATH = Path(__file__).parent.parent / "data" / "brain.sqlite"
+
+
+def is_stale(
+    engram: "Engram",
+    days_threshold: int = 90,
+    min_access_count: int = 3,
+    idle_days_threshold: int = 60,
+) -> bool:
+    """Return True when an engram is old, rarely accessed and not recently used.
+
+    All three conditions must hold: it was created more than
+    ``days_threshold`` days ago, it was accessed fewer than
+    ``min_access_count`` times, and its last access is more than
+    ``idle_days_threshold`` days ago.
+
+    Reads ``created``, ``last_accessed`` and ``access_count`` from
+    ``engram.metadata``. A missing or null ``last_accessed`` falls back to
+    ``created``; a missing or null ``access_count`` counts as 0. Timestamps
+    are ISO-8601 strings (a trailing ``Z`` is accepted); timestamps without
+    an offset are treated as UTC.
+
+    Args:
+        engram: Object exposing a ``metadata`` mapping.
+        days_threshold: Minimum age in days.
+        min_access_count: Access count below which the engram is "rarely accessed".
+        idle_days_threshold: Minimum number of days since the last access.
+
+    Returns:
+        False whenever the metadata cannot be interpreted (missing or
+        malformed timestamp, non-numeric access count).
+    """
+    meta = engram.metadata
+    created = meta.get("created") or ""
+    last_accessed = meta.get("last_accessed") or created
+
+    def _parse(stamp: object) -> datetime:
+        parsed = datetime.fromisoformat(str(stamp).replace("Z", "+00:00"))
+        # Naive values cannot be subtracted from the aware "now" below.
+        if parsed.tzinfo is None:
+            parsed = parsed.replace(tzinfo=timezone.utc)
+        return parsed
+
+    try:
+        access_count = int(meta.get("access_count") or 0)
+        created_dt = _parse(created)
+        last_accessed_dt = _parse(last_accessed)
+    except (TypeError, ValueError):
+        # Unusable metadata: never classify such an engram as stale.
+        return False
+
+    now = datetime.now(timezone.utc)
+    age_days = (now - created_dt).total_seconds() / 86400
+    days_since_access = (now - last_accessed_dt).total_seconds() / 86400
+
+    return (
+        age_days > days_threshold
+        and access_count < min_access_count
+        and days_since_access > idle_days_threshold
+    )
+
+
+def _build_report(total: int, pending_count: int, actions: dict, details: list) -> str:
+    """Render the plain-text summary report for one processing run."""
+    rule = "=" * 60
+    report_lines = [
+        rule,
+        "PENDING ENGRAMS PROCESSING REPORT",
+        rule,
+        f"Timestamp: {datetime.now(timezone.utc).isoformat()}",
+        f"Total engrams: {total}",
+        f"Pending engrams processed: {pending_count}",
+        "",
+        "ACTIONS TAKEN:",
+        f"  ✅ Auto-confirmed (confidence > 0.8): {actions['confirmed']}",
+        f"  ❌ Auto-rejected (confidence < 0.3): {actions['rejected']}",
+        f"  ⏳ Left for review (0.3 ≤ confidence ≤ 0.8): {actions['left_for_review']}",
+        f"  📦 Archived stale topics: {actions['archived_stale']}",
+        f"  ⚠️ Errors: {actions['errors']}",
+        "",
+        "DETAILS:",
+        *details,
+        "",
+        rule,
+    ]
+    return "\n".join(report_lines)
+
+
+def process_pending_engrams() -> dict:
+    """Evaluate every pending engram, apply the confidence thresholds and write a report.
+
+    An engram is pending when its verdict is neither ``confirmed_true`` nor
+    ``confirmed_false``. For each pending engram:
+
+    - if ``is_stale`` says so and it is not yet tagged, it gets an
+      ``archived`` tag and an ``archived_at`` timestamp and is saved;
+    - independently of staleness, it is confirmed when its computed
+      confidence is above 0.8, rejected when below 0.3, and otherwise left
+      untouched for review.
+
+    A failure on one engram is counted and listed in the report; it does not
+    stop the run. The report is printed and written to
+    ``reports/pending_engrams_<timestamp>.txt`` next to the project's
+    ``scripts`` directory.
+
+    Returns:
+        Counters ``confirmed``, ``rejected``, ``left_for_review``,
+        ``archived_stale`` and ``errors``.
+    """
+    store = EngramStore(str(DB_PATH))
+    # Close the store even if loading, processing or report writing raises.
+    try:
+        # NOTE(review): limit=10000 presumably caps the rows returned, so a
+        # larger database would be processed only partially — confirm.
+        all_engrams = store.get_all(limit=10000)
+        print(f"Total engrams in database: {len(all_engrams)}")
+
+        pending = [
+            eg
+            for eg in all_engrams
+            if eg.correctness.verdict not in ("confirmed_true", "confirmed_false")
+        ]
+        print(f"Pending engrams (unconfirmed/unrejected): {len(pending)}")
+
+        actions = {
+            "confirmed": 0,
+            "rejected": 0,
+            "left_for_review": 0,
+            "archived_stale": 0,
+            "errors": 0,
+        }
+        details = []
+
+        for eg in pending:
+            # Resolved before the try so error entries can name the engram.
+            engram_id = str(getattr(eg, "id", "unknown"))
+            try:
+                confidence = eg.compute_confidence()
+                content_preview = eg.content[:80] + ("..." if len(eg.content) > 80 else "")
+
+                # Stale engrams are tagged rather than deleted. Tagging does not
+                # exempt them from the confidence thresholds applied below.
+                if is_stale(eg):
+                    tags = list(eg.metadata.get("tags") or [])
+                    if "archived" not in tags:
+                        tags.append("archived")
+                        eg.metadata["tags"] = tags
+                        eg.metadata["archived_at"] = datetime.now(timezone.utc).isoformat()
+                        store.save(eg)
+                        actions["archived_stale"] += 1
+                        details.append(f"📦 Archived stale: [{engram_id[:8]}] {content_preview} (conf: {confidence:.2f})")
+
+                if confidence > 0.8:
+                    eg.correctness.confirm(by="auto_processor", note=f"Auto-confirmed: confidence {confidence:.2f}")
+                    store.save(eg)
+                    actions["confirmed"] += 1
+                    details.append(f"✅ Confirmed: [{engram_id[:8]}] {content_preview} (conf: {confidence:.2f})")
+                elif confidence < 0.3:
+                    eg.correctness.reject(by="auto_processor", note=f"Auto-rejected: confidence {confidence:.2f}")
+                    store.save(eg)
+                    actions["rejected"] += 1
+                    details.append(f"❌ Rejected: [{engram_id[:8]}] {content_preview} (conf: {confidence:.2f})")
+                else:
+                    actions["left_for_review"] += 1
+                    details.append(f"⏳ Review later: [{engram_id[:8]}] {content_preview} (conf: {confidence:.2f})")
+
+            except Exception as e:
+                # Per-item boundary: record the failure and keep processing the rest.
+                actions["errors"] += 1
+                details.append(f"⚠️ Error processing engram [{engram_id[:8]}]: {e}")
+
+        report = _build_report(len(all_engrams), len(pending), actions, details)
+
+        # Print to stdout
+        print("\n" + report)
+
+        # Save report to file
+        report_dir = Path(__file__).parent.parent / "reports"
+        report_dir.mkdir(parents=True, exist_ok=True)
+        report_file = report_dir / f"pending_engrams_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
+        report_file.write_text(report, encoding="utf-8")
+        print(f"\n📄 Report saved to: {report_file}")
+    finally:
+        store.close()
+
+    return actions
+
+
+if __name__ == "__main__":
+    process_pending_engrams()
+    print("\nProcessing complete.")