chore: sync local workspace state
This commit is contained in:
191
scripts/import_web_design_markers.py
Normal file
191
scripts/import_web_design_markers.py
Normal file
@@ -0,0 +1,191 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import hashlib
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
from src.engram import Engram, Grounding
|
||||
from src.store import EngramStore
|
||||
|
||||
|
||||
def _now_utc_iso() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _hash16(text: str) -> str:
|
||||
return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
|
||||
|
||||
|
||||
def _iter_jsonl(path: Path) -> Iterable[Dict[str, Any]]:
|
||||
with path.open("r", encoding="utf-8") as f:
|
||||
for line_no, line in enumerate(f, start=1):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
obj = json.loads(line)
|
||||
except Exception:
|
||||
raise SystemExit(f"Invalid JSON at {path}:{line_no}")
|
||||
if not isinstance(obj, dict):
|
||||
continue
|
||||
yield obj
|
||||
|
||||
|
||||
def _marker_to_content(marker_obj: Dict[str, Any]) -> Tuple[str, List[Dict[str, Any]]]:
|
||||
marker = str(marker_obj.get("marker", "")).strip()
|
||||
details = str(marker_obj.get("details", "")).strip()
|
||||
checks = marker_obj.get("checks") or []
|
||||
sources = marker_obj.get("sources") or []
|
||||
|
||||
if not marker:
|
||||
raise ValueError("missing marker")
|
||||
|
||||
evidence: List[Dict[str, Any]] = []
|
||||
for src in sources:
|
||||
if not isinstance(src, dict):
|
||||
continue
|
||||
url = (src.get("url") or "").strip()
|
||||
title = (src.get("title") or "").strip()
|
||||
if not url:
|
||||
continue
|
||||
evidence.append({"url": url, "title": title})
|
||||
|
||||
lines: List[str] = []
|
||||
lines.append(f"WEBDEV_MARKER: {marker}")
|
||||
if details:
|
||||
lines.append("")
|
||||
lines.append(f"Details: {details}")
|
||||
if isinstance(checks, list) and checks:
|
||||
lines.append("")
|
||||
lines.append("Checks:")
|
||||
for c in checks[:8]:
|
||||
c = str(c).strip()
|
||||
if c:
|
||||
lines.append(f"- {c}")
|
||||
if evidence:
|
||||
lines.append("")
|
||||
lines.append("Sources:")
|
||||
for ev in evidence[:12]:
|
||||
title = (ev.get("title") or "").strip()
|
||||
url = (ev.get("url") or "").strip()
|
||||
if title:
|
||||
lines.append(f"- {title}: {url}")
|
||||
else:
|
||||
lines.append(f"- {url}")
|
||||
return "\n".join(lines).strip(), evidence
|
||||
|
||||
|
||||
def _tags_for(marker_obj: Dict[str, Any]) -> List[str]:
|
||||
tags = ["web_design", "web_development", "mobile"]
|
||||
area = str(marker_obj.get("area", "")).strip()
|
||||
if area:
|
||||
tags.append(area)
|
||||
return tags
|
||||
|
||||
|
||||
def _load_existing_hashes(store: "EngramStore") -> set:
    """Collect the ``hash`` values already present in stored engram metadata.

    Best effort: this reads the store's private connection, so any failure
    (for example a schema mismatch) yields an empty set rather than an error,
    and rows whose metadata cannot be parsed are skipped.
    """
    hashes: set = set()
    try:
        rows = store._conn.execute("SELECT metadata_json FROM engrams").fetchall()  # noqa: SLF001
    except Exception:
        # Deliberately broad: the importer must still run without the preload.
        return hashes
    for row in rows:
        try:
            h = json.loads(row["metadata_json"]).get("hash")
        except Exception:
            continue
        if isinstance(h, str) and h:
            hashes.add(h)
    return hashes


def import_markers(
    db_path: Path,
    jsonl_paths: List[Path],
    source: str,
    verdict: str,
    agent_id: str,
    dry_run: bool,
) -> Dict[str, int]:
    """Import ``web_design_marker`` records from JSONL files into the store.

    Records whose ``kind`` is not ``"web_design_marker"`` are ignored. Each
    remaining record is rendered to text, de-duplicated by a 16-hex-char
    content hash (against both this run and hashes already stored), wrapped
    in an engram with the given verdict, and saved.

    Args:
        db_path: Path of the SQLite database handed to ``EngramStore``.
        jsonl_paths: Marker files to read, in order.
        source: Value for the engram ``source`` field.
        verdict: Correctness verdict recorded on every imported engram.
        agent_id: Agent recorded on the engram and verdict; when empty, the
            record's own ``agent_id`` (engram) or ``"importer"`` (verdict)
            is used instead.
        dry_run: When true, nothing is saved. The store is still opened.

    Returns:
        Counters: ``seen``, ``imported``, ``skipped_dup``, ``skipped_invalid``.

    Raises:
        SystemExit: Propagated from ``_iter_jsonl`` on malformed JSON.
    """
    stats = {"seen": 0, "imported": 0, "skipped_dup": 0, "skipped_invalid": 0}
    store = EngramStore(str(db_path))
    try:
        # Preload existing hashes (fast-ish; avoids duplicate spam).
        existing_hashes = _load_existing_hashes(store)
        seen_hashes: set = set()

        for path in jsonl_paths:
            for marker_obj in _iter_jsonl(path):
                if (marker_obj.get("kind") or "") != "web_design_marker":
                    continue
                stats["seen"] += 1

                try:
                    content, evidence = _marker_to_content(marker_obj)
                except Exception:
                    # A malformed record is counted and skipped, not fatal.
                    stats["skipped_invalid"] += 1
                    continue

                h = _hash16(content)
                if h in seen_hashes or h in existing_hashes:
                    stats["skipped_dup"] += 1
                    continue
                seen_hashes.add(h)

                eg = Engram.create(
                    content=content,
                    source=source,
                    confidence=0.75,
                    tags=_tags_for(marker_obj),
                    session_id=None,
                    agent_id=agent_id or str(marker_obj.get("agent_id") or ""),
                    grounding=Grounding.SOURCED,
                )
                # Overwrite hash to exactly match our content representation,
                # so the preload above recognises it on the next run.
                eg.metadata["hash"] = h
                eg.metadata["modified"] = _now_utc_iso()
                eg.metadata["created"] = marker_obj.get("created_at") or eg.metadata["created"]

                eg.correctness.set_verdict(
                    by=agent_id or "importer",
                    verdict=verdict,
                    note=f"Imported from {path.name}",
                    evidence=evidence,
                )

                if not dry_run:
                    store.save(eg)
                # NOTE(review): counted in dry-run mode too, so the summary
                # shows what would be written. The original indentation was
                # lost in extraction; confirm this matches the intent.
                stats["imported"] += 1
    finally:
        # Release the database handle even when a malformed file aborts the run.
        store.close()

    return stats
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: resolve marker files, import them, print a summary.

    Exits with an error message when the glob matches no files. The summary
    (database path, matched files, import counters) is printed as JSON.
    """
    parser = argparse.ArgumentParser(description="Import web_design_marker JSONL files into brain.sqlite")
    parser.add_argument("--db", default="second-brain/data/brain.sqlite", help="Path to brain.sqlite")
    parser.add_argument("--glob", default="/tmp/web_design_markers_*.jsonl", help="Glob for marker JSONL files")
    parser.add_argument("--source", default="web_research", help="Engram source")
    parser.add_argument("--verdict", default="probable_true", help="Correctness verdict")
    parser.add_argument("--agent-id", default="web_research_import", help="Agent id to record")
    parser.add_argument("--dry-run", action="store_true", help="Parse/dedupe but do not write to DB")
    args = parser.parse_args()

    db_path = Path(args.db)

    # Absolute patterns are globbed relative to the filesystem root with the
    # leading slash removed; relative patterns are globbed from the cwd.
    pattern = args.glob
    if pattern.startswith("/"):
        matches = Path("/").glob(pattern.lstrip("/"))
    else:
        matches = Path(".").glob(pattern)
    jsonl_paths = sorted(matches)
    if not jsonl_paths:
        raise SystemExit(f"No files match glob: {args.glob}")

    stats = import_markers(
        db_path=db_path,
        jsonl_paths=jsonl_paths,
        source=args.source,
        verdict=args.verdict,
        agent_id=args.agent_id,
        dry_run=bool(args.dry_run),
    )

    summary = {
        "db": str(db_path),
        "files": [str(found) for found in jsonl_paths],
        "stats": stats,
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))
|
||||
|
||||
|
||||
# Run the importer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
|
||||
163
scripts/process_pending_engrams.py
Normal file
163
scripts/process_pending_engrams.py
Normal file
@@ -0,0 +1,163 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Process pending second brain engrams.
|
||||
- For unconfirmed, unrejected engrams: evaluate confidence
|
||||
- If confidence > 0.8: confirm
|
||||
- If confidence < 0.3: reject
|
||||
- Otherwise: mark for review (leave as is)
|
||||
- Check for stale topics and archive if needed
|
||||
- Produce summary report
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# Make the project importable regardless of the current working directory.
# The imports below are package-qualified (`from src.store import ...`), which
# resolves against the directory that CONTAINS `src`, so that directory must be
# on sys.path as well. `<base>/src` is kept first, as before, in case modules
# inside `src` import each other by bare name.
base_dir = Path(__file__).parent.parent
sys.path.insert(0, str(base_dir / "src"))
if str(base_dir) not in sys.path:
    sys.path.insert(1, str(base_dir))
|
||||
|
||||
# Import using absolute module paths
|
||||
from src.store import EngramStore
|
||||
from src.engram import Engram, Grounding
|
||||
|
||||
# SQLite database location: data/brain.sqlite under the parent of this script's directory.
DB_PATH = Path(__file__).parent.parent / "data" / "brain.sqlite"
|
||||
|
||||
|
||||
def _parse_utc(stamp: str) -> datetime:
    """Parse an ISO 8601 timestamp into an aware datetime.

    A trailing ``Z`` is accepted. A timestamp without an offset is assumed to
    be UTC (NOTE(review): confirm against how ``created`` is written), so it
    can be subtracted from an aware "now" without raising ``TypeError``.
    """
    parsed = datetime.fromisoformat(stamp.replace("Z", "+00:00"))
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def is_stale(
    engram: "Engram",
    days_threshold: int = 90,
    *,
    access_threshold: int = 3,
    idle_days_threshold: int = 60,
) -> bool:
    """Check if an engram is stale (old and rarely accessed).

    An engram is stale when all three hold: it was created more than
    ``days_threshold`` days ago, its access count is below
    ``access_threshold``, and it was last accessed more than
    ``idle_days_threshold`` days ago.

    Args:
        engram: Object exposing a ``metadata`` mapping. Reads the keys
            ``created``, ``last_accessed`` and ``access_count``.
        days_threshold: Minimum age in days.
        access_threshold: Access counts below this qualify as "rarely used".
        idle_days_threshold: Minimum days since the last access.

    Returns:
        ``True`` if stale. ``False`` otherwise, including when the timestamps
        are missing or cannot be parsed.
    """
    metadata = engram.metadata
    # `or` (not a .get default) so that keys stored as None fall back too.
    created = metadata.get("created") or ""
    last_accessed = metadata.get("last_accessed") or created
    access_count = metadata.get("access_count") or 0

    try:
        created_dt = _parse_utc(created)
        last_accessed_dt = _parse_utc(last_accessed)
        now = datetime.now(timezone.utc)
        age_days = (now - created_dt).total_seconds() / 86400
        days_since_access = (now - last_accessed_dt).total_seconds() / 86400
        return (
            age_days > days_threshold
            and access_count < access_threshold
            and days_since_access > idle_days_threshold
        )
    except (AttributeError, TypeError, ValueError):
        # Unusable metadata (wrong types, unparsable dates): not provably
        # stale, so leave the engram alone.
        return False
|
||||
|
||||
|
||||
def _format_report(total: int, pending_count: int, actions: dict, details: list) -> str:
    """Render the plain-text summary report for one processing run."""
    rule = "=" * 60
    lines = [
        rule,
        "PENDING ENGRAMS PROCESSING REPORT",
        rule,
        f"Timestamp: {datetime.now(timezone.utc).isoformat()}",
        f"Total engrams: {total}",
        f"Pending engrams processed: {pending_count}",
        "",
        "ACTIONS TAKEN:",
        f" ✅ Auto-confirmed (confidence > 0.8): {actions['confirmed']}",
        f" ❌ Auto-rejected (confidence < 0.3): {actions['rejected']}",
        f" ⏳ Left for review (0.3 ≤ confidence ≤ 0.8): {actions['left_for_review']}",
        f" 📦 Archived stale topics: {actions['archived_stale']}",
        f" ⚠️ Errors: {actions['errors']}",
        "",
        "DETAILS:",
        *details,
        "",
        rule,
    ]
    return "\n".join(lines)


def process_pending_engrams() -> dict:
    """Main processing function.

    Loads up to 10000 engrams from the store at ``DB_PATH`` and treats every
    engram whose verdict is neither ``confirmed_true`` nor ``confirmed_false``
    as pending. For each pending engram:

    * if it is stale and not yet tagged ``archived``, tag it and record
      ``archived_at``;
    * confirm it when its computed confidence is above 0.8, reject it when
      below 0.3, otherwise leave it for review. Stale engrams go through the
      same thresholds.

    A failure on one engram is counted and reported without stopping the run.
    The report is printed and written to ``reports/`` next to the project's
    scripts directory.

    Returns:
        Counters keyed ``confirmed``, ``rejected``, ``left_for_review``,
        ``archived_stale`` and ``errors``.
    """
    store = EngramStore(str(DB_PATH))
    try:
        all_engrams = store.get_all(limit=10000)
        print(f"Total engrams in database: {len(all_engrams)}")

        pending = [
            eg
            for eg in all_engrams
            if eg.correctness.verdict not in ("confirmed_true", "confirmed_false")
        ]
        print(f"Pending engrams (unconfirmed/unrejected): {len(pending)}")

        actions = {
            "confirmed": 0,
            "rejected": 0,
            "left_for_review": 0,
            "archived_stale": 0,
            "errors": 0,
        }
        details = []

        for eg in pending:
            engram_id = "unknown"
            try:
                # Resolve the id first so an error below can name the engram.
                engram_id = str(eg.id)
                confidence = eg.compute_confidence()
                content_preview = eg.content[:80] + ("..." if len(eg.content) > 80 else "")

                if is_stale(eg):
                    # Archiving is a tag, not a delete. Skip engrams archived
                    # by an earlier run so `archived_at` keeps its first value
                    # and the same engram is not re-counted every run.
                    tags = eg.metadata.get("tags") or []
                    if "archived" not in tags:
                        tags.append("archived")
                        eg.metadata["tags"] = tags
                        eg.metadata["archived_at"] = datetime.now(timezone.utc).isoformat()
                        store.save(eg)
                        actions["archived_stale"] += 1
                        details.append(f"📦 Archived stale: [{engram_id[:8]}] {content_preview} (conf: {confidence:.2f})")

                if confidence > 0.8:
                    eg.correctness.confirm(by="auto_processor", note=f"Auto-confirmed: confidence {confidence:.2f}")
                    store.save(eg)
                    actions["confirmed"] += 1
                    details.append(f"✅ Confirmed: [{engram_id[:8]}] {content_preview} (conf: {confidence:.2f})")
                elif confidence < 0.3:
                    eg.correctness.reject(by="auto_processor", note=f"Auto-rejected: confidence {confidence:.2f}")
                    store.save(eg)
                    actions["rejected"] += 1
                    details.append(f"❌ Rejected: [{engram_id[:8]}] {content_preview} (conf: {confidence:.2f})")
                else:
                    actions["left_for_review"] += 1
                    details.append(f"⏳ Review later: [{engram_id[:8]}] {content_preview} (conf: {confidence:.2f})")
            except Exception as e:
                # Per-engram boundary: record the failure and keep going.
                actions["errors"] += 1
                details.append(f"⚠️ Error processing engram [{engram_id[:8]}]: {e}")

        report = _format_report(len(all_engrams), len(pending), actions, details)
        print("\n" + report)

        report_dir = Path(__file__).parent.parent / "reports"
        report_dir.mkdir(parents=True, exist_ok=True)
        # UTC, to agree with the timestamp printed inside the report.
        stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        report_file = report_dir / f"pending_engrams_{stamp}.txt"
        report_file.write_text(report, encoding="utf-8")
        print(f"\n📄 Report saved to: {report_file}")
    finally:
        # Release the database handle even if loading or reporting fails.
        store.close()

    return actions
|
||||
|
||||
|
||||
# Script entry point: run one processing pass and announce completion.
if __name__ == "__main__":
    # The returned counters are kept in `result` but not used further here.
    result = process_pending_engrams()
    print("\nProcessing complete.")
|
||||
Reference in New Issue
Block a user