_DEFAULT_DB = Path(__file__).resolve().parent.parent / "data" / "brain.sqlite"


def _resolve_db_path():
    """Return the SQLite path from Streamlit secrets, or the bundled default.

    ``hasattr(st, "secrets")`` is not a usable guard: the attribute always
    exists, but reading from it raises when no secrets file is present.
    NOTE(review): current Streamlit raises a FileNotFoundError subclass in
    that case - confirm against the pinned Streamlit version.
    """
    try:
        return str(st.secrets.get("db_path", _DEFAULT_DB))
    except (FileNotFoundError, AttributeError):
        return str(_DEFAULT_DB)


_DB_PATH = _resolve_db_path()


def _store():
    """Open the engram store at the configured database path."""
    return EngramStore(_DB_PATH)


def _chroma():
    """Open the Chroma vector store located next to the SQLite database."""
    return ChromaStore(str(Path(_DB_PATH).parent / "chroma"))


def _retriever():
    """Build a retriever over the SQLite store and the vector store."""
    return Retriever(_store(), _chroma())


def _scorer():
    """Create a fresh neural scorer instance."""
    return NeuralScorer()


def _feedback_buttons(eg, get_store, confirm_key, reject_key, confirm_msg, reject_msg):
    """Render Confirm/Reject buttons for one engram and persist the verdict.

    ``get_store`` is a zero-argument callable so a store is only opened when
    a button was actually clicked.
    """
    col_ok, col_no = st.columns(2)
    if col_ok.button("✅ Confirm", key=confirm_key):
        eg.correctness.confirm("user")
        get_store().save(eg)
        st.success(confirm_msg)
    if col_no.button("❌ Reject", key=reject_key):
        eg.correctness.reject("user")
        get_store().save(eg)
        st.warning(reject_msg)


st.set_page_config(page_title="Second Brain Dashboard", layout="wide")
st.title("🧠 Second Brain Dashboard")

page = st.sidebar.radio(
    "Seite",
    ["Übersicht", "Engramme", "Suche", "Graph", "Stats", "Neural Scorer"],
)


if page == "Übersicht":
    store = _store()
    engrams = store.get_all()
    confirmed = sum(1 for e in engrams if e.correctness.confirmed)
    unconfirmed = len(engrams) - confirmed
    # max(1, ...) avoids a division by zero on an empty store.
    avg_conf = sum(e.compute_confidence() for e in engrams) / max(1, len(engrams))

    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Total", len(engrams))
    c2.metric("Confirmed", confirmed)
    c3.metric("Pending", unconfirmed)
    c4.metric("Avg Confidence", f"{avg_conf:.2f}")

    st.subheader("Recent Engramme")
    recent = sorted(engrams, key=lambda e: e.metadata.get("modified", ""), reverse=True)[:5]
    for eg in recent:
        with st.expander(f"{eg.content[:80]}..."):
            st.write(f"Source: {eg.metadata.get('source')}")
            st.write(f"Confidence: {eg.compute_confidence():.2f}")
            st.write(f"Confirmed: {'✅' if eg.correctness.confirmed else '❓'}")
            st.write("Tags:", ", ".join(eg.metadata.get("tags", [])))


elif page == "Engramme":
    store = _store()
    st.subheader("Alle Engramme")
    tag_filter = st.text_input("Filter tags")
    source_filter = st.selectbox("Source", ["alle", "user", "agent", "web", "file", "system"])
    for eg in store.get_all():
        tags = eg.metadata.get("tags", [])
        src = eg.metadata.get("source", "")
        # The tag filter is an exact match against one of the engram's tags.
        if tag_filter and tag_filter not in tags:
            continue
        if source_filter != "alle" and source_filter != src:
            continue
        with st.expander(f"{eg.content[:100]}"):
            st.write("Confidence:", f"{eg.compute_confidence():.2f}")
            st.write("Tags:", ", ".join(tags))
            st.write("Source:", src)
            _feedback_buttons(
                eg,
                lambda: store,
                f"conf_{eg.id}",
                f"rej_{eg.id}",
                "Confirmed!",
                "Rejected.",
            )


elif page == "Suche":
    st.subheader("Semantic + Keyword Suche")
    query = st.text_input("Query")
    mode = st.radio("Modus", ["Hybrid", "Keyword", "Semantic"])
    if st.button("Suchen") and query:
        ret = _retriever()
        if mode == "Hybrid":
            results = ret.hybrid_retrieve(query, limit=10)
        elif mode == "Semantic":
            results = ret.semantic_retrieve(query, limit=10)
        else:
            results = ret.retrieve(query, limit=10)
        # A button is only True during the rerun triggered by its own click.
        # Keeping the hits in session state lets the Confirm/Reject buttons
        # below survive the rerun that their click causes.
        st.session_state["search_results"] = results

    for r in st.session_state.get("search_results", []):
        eg = r["engram"]
        with st.container():
            st.markdown(f"**{eg.content[:200]}...**")
            st.write(
                f"Score: {r['score']:.3f} | Match: {r['match_type']} | "
                f"Conf: {eg.compute_confidence():.2f}"
            )
            _feedback_buttons(
                eg,
                _store,
                f"sc_{eg.id}",
                f"sr_{eg.id}",
                "Confirmed",
                "Rejected",
            )


elif page == "Graph":
    st.subheader("Graph-Visualisierung")
    graph_html_path = Path(_DB_PATH).parent / "graph_view.html"
    if graph_html_path.exists():
        html = graph_html_path.read_text(encoding="utf-8")
        # Rendered inside an iframe by the components API.
        st.components.v1.html(html, height=800, scrolling=True)
    else:
        st.info("Graph nicht generiert. Führe `python -m src.cli graph` aus.")
    if st.button("Graph generieren"):
        from src.graph_view import generate_graph_html

        path = generate_graph_html(_store(), str(graph_html_path))
        st.success(f"Graph generiert: {path}")


elif page == "Stats":
    from collections import Counter

    store = _store()
    engrams = store.get_all()
    confirmed = sum(1 for e in engrams if e.correctness.confirmed)
    # Single pass per counter instead of rescanning all engrams per key.
    source_counts = Counter(e.metadata.get("source") for e in engrams)
    tag_counts = Counter(t for e in engrams for t in e.metadata.get("tags", []))
    st.json({
        "total": len(engrams),
        "confirmed": confirmed,
        "pending": len(engrams) - confirmed,
        "sources": dict(source_counts),
        "tags": dict(tag_counts),
        "avg_confidence": sum(e.compute_confidence() for e in engrams) / max(1, len(engrams)),
    })


elif page == "Neural Scorer":
    st.subheader("Neural Scorer Training")
    scorer = _scorer()
    store = _store()
    engrams = store.get_all()
    # An engram counts as labelled once it was confirmed or rejected at least once.
    labeled = [e for e in engrams if e.correctness.confirmed or e.correctness.rejections > 0]
    st.write(f"Labelled Engramme: {len(labeled)}")
    if st.button("Train Neural Scorer"):
        if len(labeled) < 2:
            st.error("Mindestens 2 labelierte Engramme nötig (confirm + reject).")
        else:
            result = scorer.train(labeled, epochs=30)
            st.json(result)
            st.success("Training abgeschlossen!")

    if st.button("Predict All"):
        # Only the first ten engrams are scored to keep the page responsive.
        for eg in engrams[:10]:
            pred = scorer.predict(eg)
            st.write(f"{eg.content[:60]}... → {pred:.3f}")
class ChromaStore:
    """ChromaDB-backed vector store for engrams.

    Each engram is stored under ``str(engram.id)`` together with its
    embedding and a flattened copy of its metadata.
    """

    # Number of content characters copied into the stored metadata record.
    _CONTENT_SNIPPET_LEN = 1000

    def __init__(self, path: str = "data/chroma"):
        """Open (or create) the persistent ``engrams`` collection at *path*."""
        self.path = Path(path)
        self.path.mkdir(parents=True, exist_ok=True)
        self.client = chromadb.PersistentClient(path=str(self.path))
        self.collection = self.client.get_or_create_collection(
            name="engrams",
            metadata={"hnsw:space": "cosine"},
        )

    def _build_metadata(self, engram: Engram) -> Dict[str, Any]:
        """Return a flattened copy of ``engram.metadata``.

        ``tags`` is always emitted as one comma-separated string (lists,
        tuples, sets and plain strings are all accepted; a missing or
        ``None`` value becomes ``""``). ``hierarchy`` is JSON-encoded, and
        ``source``/``confidence``/``correctness`` receive defaults when
        absent. The engram's own metadata dict is not modified.
        """
        meta = dict(engram.metadata)
        tags = meta.pop("tags", None)
        if tags is None:
            meta["tags"] = ""
        elif isinstance(tags, str):
            meta["tags"] = tags
        elif isinstance(tags, (set, frozenset)):
            # Sets have no stable order; sort so the stored value is deterministic.
            meta["tags"] = ",".join(sorted(str(t) for t in tags))
        elif isinstance(tags, (list, tuple)):
            meta["tags"] = ",".join(str(t) for t in tags)
        else:
            meta["tags"] = str(tags)
        meta.setdefault("source", "agent")
        meta.setdefault("confidence", 0.5)
        meta.setdefault("correctness", "unconfirmed")
        if "hierarchy" in meta:
            meta["hierarchy"] = json.dumps(meta["hierarchy"])
        return meta

    def _resolve_embedding(
        self, engram: Engram, embedding: Optional[List[float]]
    ) -> Optional[List[float]]:
        """Pick the first usable vector: argument, engram attribute, fresh encode.

        Uses explicit ``None``/length checks instead of truthiness so numpy
        arrays do not raise and empty vectors are treated as missing.
        Returns ``None`` when no vector can be obtained.
        """
        for candidate in (embedding, engram.embedding):
            if candidate is not None and len(candidate) > 0:
                return [float(x) for x in candidate]
        computed = encode(engram.content)
        if computed is None or len(computed) == 0:
            return None
        return computed

    def _record_metadata(self, engram: Engram) -> Dict[str, Any]:
        """Metadata as stored in the collection: flattened fields plus a content snippet."""
        meta = self._build_metadata(engram)
        meta["content"] = engram.content[: self._CONTENT_SNIPPET_LEN]
        return meta

    def add(self, engram: Engram, embedding: Optional[List[float]] = None) -> None:
        """Add *engram* with its embedding; a no-op when no embedding is available."""
        emb = self._resolve_embedding(engram, embedding)
        if emb is None:
            return
        self.collection.add(
            ids=[str(engram.id)],
            embeddings=[emb],
            metadatas=[self._record_metadata(engram)],
        )

    def update(self, engram: Engram, embedding: Optional[List[float]] = None) -> None:
        """Update the stored record of *engram*; a no-op when no embedding is available.

        Sends the same metadata shape as :meth:`add`, including the content
        snippet, so the snippet is refreshed along with the vector.
        """
        emb = self._resolve_embedding(engram, embedding)
        if emb is None:
            return
        self.collection.update(
            ids=[str(engram.id)],
            embeddings=[emb],
            metadatas=[self._record_metadata(engram)],
        )

    def delete(self, eid: str) -> None:
        """Remove the record with id *eid* from the collection."""
        self.collection.delete(ids=[eid])

    def query(self, text: str, top_k: int = 5, filters: Optional[Dict] = None) -> List[Dict[str, Any]]:
        """Return up to *top_k* nearest records for *text*.

        Each hit is a dict with ``id``, ``distance`` and ``metadata``.
        Returns ``[]`` when *top_k* is not positive or *text* cannot be
        embedded.
        """
        if top_k <= 0:
            return []
        emb = encode(text)
        if emb is None:
            return []
        results = self.collection.query(
            query_embeddings=[emb],
            n_results=top_k,
            where=filters,
            include=["metadatas", "distances"],
        )
        # One query embedding was sent, so only the first result row is relevant.
        return [
            {"id": rid, "distance": dist, "metadata": meta}
            for rid, dist, meta in zip(
                results["ids"][0], results["distances"][0], results["metadatas"][0]
            )
        ]

    def get_by_id(self, eid: str) -> Optional[Dict[str, Any]]:
        """Fetch one record by id, or ``None`` when absent or on any store error."""
        try:
            r = self.collection.get(ids=[eid], include=["embeddings", "metadatas"])
            if r and r["ids"]:
                embeddings = r.get("embeddings")
                metadatas = r.get("metadatas")
                return {
                    "id": r["ids"][0],
                    "embedding": (
                        embeddings[0]
                        if embeddings is not None and len(embeddings) > 0
                        else None
                    ),
                    "metadata": metadatas[0] if metadatas else {},
                }
        except Exception as e:  # best-effort lookup: report and fall through
            print(f"[chroma_store] get_by_id failed: {e}")
        return None

    def count(self) -> int:
        """Number of records in the collection."""
        return self.collection.count()
_MODEL_NAME = "all-MiniLM-L6-v2"
_EMBED_DIM = 384
_CACHE_DIR = Path(__file__).resolve().parent.parent / "data" / "embedding_cache"

__model: Optional[SentenceTransformer] = None


def _get_model() -> SentenceTransformer:
    """Return the process-wide model, loading it on first use."""
    global __model
    if __model is None:
        __model = SentenceTransformer(_MODEL_NAME)
    return __model


def _text_hash(text: str) -> str:
    """SHA-256 hex digest of the UTF-8 encoded text (the cache key)."""
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


def _cache_path(h: str, normalize: bool = True) -> Path:
    """Return the cache file for hash *h*, creating the cache directory.

    Normalised vectors keep the original ``<hash>.json`` name so existing
    caches stay valid; raw vectors use ``<hash>.raw.json`` so the two
    variants are never confused.
    """
    _CACHE_DIR.mkdir(parents=True, exist_ok=True)
    suffix = ".json" if normalize else ".raw.json"
    return _CACHE_DIR / f"{h}{suffix}"


def _cache_file(text: str, normalize: bool) -> Optional[Path]:
    """Cache file for *text*, or ``None`` when the cache directory is unusable."""
    try:
        return _cache_path(_text_hash(text), normalize)
    except OSError as e:
        print(f"[embedder] Cache unavailable: {e}")
        return None


def _read_cache(cp: Path) -> Optional[List[float]]:
    """Load a cached vector; any missing or unreadable entry counts as a miss."""
    try:
        with open(cp, "r", encoding="utf-8") as f:
            vec = json.load(f)["embedding"]
    except (OSError, ValueError, KeyError, TypeError):
        return None
    return vec if isinstance(vec, list) and vec else None


def _write_cache(cp: Path, text: str, vec: List[float]) -> None:
    """Persist *vec* for *text*; failures are reported but never propagated.

    The entry is written to a temporary sibling and moved into place so a
    reader never sees a half-written file.
    """
    tmp = cp.with_name(f"{cp.name}.{os.getpid()}.tmp")
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump({"text": text, "embedding": vec}, f, ensure_ascii=False)
        os.replace(tmp, cp)
    except (OSError, ValueError) as e:
        print(f"[embedder] Cache write failed: {e}")


def encode(text: str, cache: bool = True, normalize: bool = True) -> Optional[List[float]]:
    """Embed *text* and return the vector as a list of floats.

    Args:
        text: Text to embed.
        cache: Read from / write to the on-disk cache.
        normalize: Scale the vector to unit length (zero vectors are left as is).

    Returns:
        The embedding, or ``None`` when encoding fails (for example because
        the model cannot be loaded).
    """
    try:
        cp = _cache_file(text, normalize) if cache else None
        if cp is not None:
            hit = _read_cache(cp)
            if hit is not None:
                return hit

        vec = _get_model().encode(text, convert_to_numpy=True)
        if normalize:
            norm = np.linalg.norm(vec)
            if norm > 0:
                vec = vec / norm
        vec_list = vec.tolist()

        if cp is not None:
            _write_cache(cp, text, vec_list)
        return vec_list
    except Exception as e:  # documented contract: report and return None
        print(f"[embedder] Encoding failed: {e}")
        return None


def encode_batch(texts: List[str], cache: bool = True, normalize: bool = True) -> List[Optional[List[float]]]:
    """Embed several texts; the result is aligned with *texts*.

    Cached entries are served from disk and only the remaining texts are
    sent to the model in one batch. When the model fails, entries that were
    already resolved from the cache are kept and the rest stay ``None``.
    """
    results: List[Optional[List[float]]] = [None] * len(texts)
    try:
        pending: List[int] = []
        paths: List[Optional[Path]] = []
        for i, text in enumerate(texts):
            cp = _cache_file(text, normalize) if cache else None
            paths.append(cp)
            hit = _read_cache(cp) if cp is not None else None
            if hit is not None:
                results[i] = hit
            else:
                pending.append(i)

        if pending:
            vecs = _get_model().encode([texts[i] for i in pending], convert_to_numpy=True)
            if normalize:
                norms = np.linalg.norm(vecs, axis=1, keepdims=True)
                norms[norms == 0] = 1  # leave zero vectors untouched
                vecs = vecs / norms
            for i, vec in zip(pending, vecs):
                vec_list = vec.tolist()
                results[i] = vec_list
                if paths[i] is not None:
                    _write_cache(paths[i], texts[i], vec_list)
    except Exception as e:  # documented contract: report, keep what we have
        print(f"[embedder] Batch encoding failed: {e}")
    return results


def similar(query: str, candidates: List[str], top_k: int = 5) -> List[tuple]:
    """Return the *top_k* candidates ranked by dot product with *query*.

    Each item is ``(candidate, score)``, best first. Candidates that could
    not be embedded are skipped; an un-embeddable query or an empty
    candidate list yields ``[]``.
    """
    q = encode(query)
    if q is None or not candidates:
        return []
    q_vec = np.asarray(q)
    scores = [
        (i, float(np.dot(q_vec, np.asarray(c_vec))))
        for i, c_vec in enumerate(encode_batch(candidates))
        if c_vec is not None
    ]
    scores.sort(key=lambda pair: pair[1], reverse=True)
    return [(candidates[i], s) for i, s in scores[:top_k]]
+ +Knoten: Engramme (Farbe = Confidence)
+Grün=hoch, Gelb=mittel, Rot=niedrig
+Links: Verknüpfungen
+Klicke für Details
+