# documents_store.py — from the mini-context-graph skill (mirrored from github/awesome-copilot).
"""
|
||||
documents_store.py — Persistent storage for raw documents and chunks (RAG layer).
|
||||
|
||||
Inspired by Karpathy's LLM Wiki pattern: raw sources are immutable and stored
|
||||
as the ground truth. Chunks are the retrieval unit; provenance links tie graph
|
||||
nodes/edges back to specific chunks.
|
||||
|
||||
Handles:
|
||||
- Storing raw documents with metadata
|
||||
- Chunking documents into overlapping text windows
|
||||
- Retrieving chunks by id or by keyword search
|
||||
- Persisting to data/documents.json
|
||||
"""
|
||||
from __future__ import annotations

import json
import os
import re
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path

# Make the package root importable so the sibling `config` module resolves
# when this file is run directly as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))
import config

# Storage location: an env var override (useful for tests) wins over the
# configured data directory.
_DATA_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_DATA_DIR", str(config.DATA_DIR)))
_DOCS_FILE = _DATA_DIR / "documents.json"

# Chunking parameters, both measured in characters.
_CHUNK_SIZE = 500      # window size per chunk
_CHUNK_OVERLAP = 100   # overlap between consecutive windows

# Common English function words excluded from keyword tokenization.
_STOPWORDS = frozenset(
    """
    a an the is are was were be been being
    have has had do does did will would could
    should may might shall can to of in on
    at by for with from and or but not it
    its this that these those i you he she
    we they what which who how why when where
    """.split()
)


def _load() -> dict:
    """Load the document store from disk.

    Returns:
        The parsed store dict, or a fresh empty store (``{"documents": {}}``)
        when no file exists yet.
    """
    if _DOCS_FILE.exists():
        # Explicit encoding: the file is written as UTF-8 by _save(); without
        # it, reads would depend on the platform's locale default.
        with open(_DOCS_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    return {"documents": {}}


def _save(store: dict) -> None:
    """Persist the whole store dict to data/documents.json.

    Creates the data directory on first use. Writes UTF-8 explicitly so the
    on-disk bytes don't depend on the platform's locale default encoding.
    """
    _DATA_DIR.mkdir(parents=True, exist_ok=True)
    with open(_DOCS_FILE, "w", encoding="utf-8") as f:
        json.dump(store, f, indent=2)


def _tokenize(text: str) -> list[str]:
    """Lowercase *text* and return its alphanumeric tokens, dropping
    stopwords and single-character tokens."""
    return [
        tok
        for tok in re.findall(r"[a-z0-9]+", text.lower())
        if len(tok) > 1 and tok not in _STOPWORDS
    ]


def _chunk_text(content: str, chunk_size: int = _CHUNK_SIZE, overlap: int = _CHUNK_OVERLAP) -> list[str]:
    """Split content into overlapping character windows.

    Args:
        content: Text to split; empty input yields an empty list.
        chunk_size: Window width in characters; must be positive.
        overlap: Characters shared between consecutive windows; must be
            smaller than chunk_size.

    Returns:
        Stripped, non-empty window strings in document order.

    Raises:
        ValueError: If chunk_size <= 0 or overlap >= chunk_size — either
            would make the step size non-positive and loop forever.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    chunks = []
    start = 0
    while start < len(content):
        end = start + chunk_size
        chunks.append(content[start:end].strip())
        if end >= len(content):
            break
        start += chunk_size - overlap
    # Drop windows that were all whitespace.
    return [c for c in chunks if c]


def add_document(
    doc_id: str,
    title: str,
    source: str,
    content: str,
) -> dict:
    """
    Store a raw document and auto-generate chunks.

    Args:
        doc_id: Caller-supplied stable identifier (e.g. "doc_001" or a filename).
        title: Human-readable title.
        source: Origin path/URL (immutable provenance pointer).
        content: Full raw text to store and chunk.

    Returns:
        The stored document dict including generated chunk_ids.
    """
    store = _load()

    # Idempotent: a doc_id that is already present is returned unchanged.
    existing = store["documents"].get(doc_id)
    if existing is not None:
        return existing

    # Chunk ids are zero-padded so they sort lexicographically in index order.
    chunks = [
        {
            "chunk_id": f"{doc_id}_chunk_{idx:03d}",
            "index": idx,
            "text": piece,
        }
        for idx, piece in enumerate(_chunk_text(content))
    ]

    record = {
        "id": doc_id,
        "title": title,
        "source": source,
        "content": content,
        "chunks": chunks,
        "ingestion_date": datetime.now(timezone.utc).isoformat(),
    }
    store["documents"][doc_id] = record
    _save(store)
    return record


def get_document(doc_id: str) -> dict | None:
    """Return the full document record, or None when doc_id is unknown."""
    return _load()["documents"].get(doc_id)


def get_chunk(chunk_id: str) -> dict | None:
    """Return a specific chunk by its chunk_id (searches across all documents)."""
    store = _load()
    every_chunk = (
        chunk
        for document in store["documents"].values()
        for chunk in document["chunks"]
    )
    # First match wins; None when no chunk carries this id.
    return next((c for c in every_chunk if c["chunk_id"] == chunk_id), None)


def get_chunks_for_document(doc_id: str) -> list[dict]:
    """Return all chunks for a document (empty list for an unknown doc_id)."""
    document = get_document(doc_id)
    return [] if document is None else document["chunks"]


def search_chunks(query: str, top_k: int = 5) -> list[dict]:
    """
    Keyword search over chunk text. Returns top_k matching chunks sorted by
    term overlap (simple TF-style scoring, no embeddings required).

    Returns list of dicts with keys: chunk_id, doc_id, score, text.
    """
    store = _load()
    terms = set(_tokenize(query))
    if not terms:
        # Query contained only stopwords / punctuation — nothing to match on.
        return []

    hits: list[tuple[float, dict]] = []
    for document in store["documents"].values():
        for chunk in document["chunks"]:
            matched = terms.intersection(_tokenize(chunk["text"]))
            if not matched:
                continue
            # Score = fraction of query terms present in this chunk.
            score = len(matched) / len(terms)
            hits.append((score, {
                "chunk_id": chunk["chunk_id"],
                "doc_id": document["id"],
                "doc_title": document["title"],
                "score": round(score, 4),
                "text": chunk["text"],
            }))

    # Stable sort keeps document order among equal scores.
    hits.sort(key=lambda pair: pair[0], reverse=True)
    return [payload for _, payload in hits[:top_k]]


def list_documents() -> list[dict]:
    """Return a summary list of all stored documents (no content, no chunks)."""
    summaries = []
    for doc in _load()["documents"].values():
        summaries.append({
            "id": doc["id"],
            "title": doc["title"],
            "source": doc["source"],
            "chunk_count": len(doc["chunks"]),
            "ingestion_date": doc["ingestion_date"],
        })
    return summaries