"""
documents_store.py — Persistent storage for raw documents and chunks (RAG layer).
Inspired by Karpathy's LLM Wiki pattern: raw sources are immutable and stored
as the ground truth. Chunks are the retrieval unit; provenance links tie graph
nodes/edges back to specific chunks.
Handles:
- Storing raw documents with metadata
- Chunking documents into overlapping text windows
- Retrieving chunks by id or by keyword search
- Persisting to data/documents.json
"""
from __future__ import annotations

import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path

# Put the parent scripts/ directory on sys.path so `import config` resolves
# when this file is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent))
import config

_DATA_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_DATA_DIR", str(config.DATA_DIR)))
_DOCS_FILE = _DATA_DIR / "documents.json"

_CHUNK_SIZE = 500  # characters per chunk
_CHUNK_OVERLAP = 100  # overlap between consecutive chunks
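
# On-disk layout of documents.json (a sketch derived from the functions below;
# field values are illustrative):
#
# {
#   "documents": {
#     "<doc_id>": {
#       "id": "<doc_id>",
#       "title": "...",
#       "source": "...",
#       "content": "<full raw text>",
#       "chunks": [{"chunk_id": "<doc_id>_chunk_000", "index": 0, "text": "..."}, ...],
#       "ingestion_date": "<UTC ISO-8601 timestamp>"
#     }
#   }
# }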

_STOPWORDS = frozenset([
    "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "will", "would", "could",
    "should", "may", "might", "shall", "can", "to", "of", "in", "on",
    "at", "by", "for", "with", "from", "and", "or", "but", "not", "it",
    "its", "this", "that", "these", "those", "i", "you", "he", "she",
    "we", "they", "what", "which", "who", "how", "why", "when", "where",
])


def _load() -> dict:
    """Load the document store from disk, or return an empty store."""
    if _DOCS_FILE.exists():
        with open(_DOCS_FILE, "r") as f:
            return json.load(f)
    return {"documents": {}}


def _save(store: dict) -> None:
    """Write the document store to disk, creating the data dir if needed."""
    _DATA_DIR.mkdir(parents=True, exist_ok=True)
    with open(_DOCS_FILE, "w") as f:
        json.dump(store, f, indent=2)


def _tokenize(text: str) -> list[str]:
    """Lowercase, split into alphanumeric runs, and drop stopwords and single characters."""
    tokens = re.findall(r"[a-z0-9]+", text.lower())
    return [t for t in tokens if t not in _STOPWORDS and len(t) > 1]


def _chunk_text(content: str, chunk_size: int = _CHUNK_SIZE, overlap: int = _CHUNK_OVERLAP) -> list[str]:
    """Split content into overlapping character windows."""
    chunks = []
    start = 0
    while start < len(content):
        end = start + chunk_size
        chunks.append(content[start:end].strip())
        if end >= len(content):
            break
        start += chunk_size - overlap
    return [c for c in chunks if c]
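
# Worked example (illustrative, using the defaults above): a 1,200-character
# document yields windows [0:500], [400:900], and [800:1200]. Each new window
# starts chunk_size - overlap = 400 characters after the previous one, so
# consecutive chunks share 100 characters of context.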
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def add_document(
    doc_id: str,
    title: str,
    source: str,
    content: str,
) -> dict:
    """
    Store a raw document and auto-generate chunks.

    Args:
        doc_id: Caller-supplied stable identifier (e.g. "doc_001" or a filename).
        title: Human-readable title.
        source: Origin path/URL (immutable provenance pointer).
        content: Full raw text to store and chunk.

    Returns:
        The stored document dict, including its generated chunks (each carrying
        a stable chunk_id).
    """
    store = _load()
    # Idempotent: return existing doc if already stored
    if doc_id in store["documents"]:
        return store["documents"][doc_id]
    raw_chunks = _chunk_text(content)
    chunks = []
    for i, text in enumerate(raw_chunks):
        chunks.append({
            "chunk_id": f"{doc_id}_chunk_{i:03d}",
            "index": i,
            "text": text,
        })
    doc = {
        "id": doc_id,
        "title": title,
        "source": source,
        "content": content,
        "chunks": chunks,
        "ingestion_date": datetime.now(timezone.utc).isoformat(),
    }
    store["documents"][doc_id] = doc
    _save(store)
    return doc
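
# Usage sketch (the identifiers and file below are hypothetical, not part of
# this module):
#
#     doc = add_document(
#         doc_id="doc_001",
#         title="Design notes",
#         source="notes/design.md",
#         content=Path("notes/design.md").read_text(),
#     )
#     doc["chunks"][0]["chunk_id"]  # -> "doc_001_chunk_000"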


def get_document(doc_id: str) -> dict | None:
    """Return the full document record or None if not found."""
    store = _load()
    return store["documents"].get(doc_id)


def get_chunk(chunk_id: str) -> dict | None:
    """Return a specific chunk by its chunk_id (searches across all documents)."""
    store = _load()
    for doc in store["documents"].values():
        for chunk in doc["chunks"]:
            if chunk["chunk_id"] == chunk_id:
                return chunk
    return None


def get_chunks_for_document(doc_id: str) -> list[dict]:
    """Return all chunks for a document."""
    doc = get_document(doc_id)
    if doc is None:
        return []
    return doc["chunks"]


def search_chunks(query: str, top_k: int = 5) -> list[dict]:
    """
    Keyword search over chunk text. Returns top_k matching chunks sorted by
    term overlap (simple TF-style scoring, no embeddings required).

    Returns list of dicts with keys: chunk_id, doc_id, doc_title, score, text.
    """
    store = _load()
    query_tokens = set(_tokenize(query))
    if not query_tokens:
        return []
    scored: list[tuple[float, dict]] = []
    for doc in store["documents"].values():
        for chunk in doc["chunks"]:
            chunk_tokens = set(_tokenize(chunk["text"]))
            overlap = len(query_tokens & chunk_tokens)
            if overlap > 0:
                score = overlap / len(query_tokens)
                scored.append((score, {
                    "chunk_id": chunk["chunk_id"],
                    "doc_id": doc["id"],
                    "doc_title": doc["title"],
                    "score": round(score, 4),
                    "text": chunk["text"],
                }))
    scored.sort(key=lambda x: x[0], reverse=True)
    return [item for _, item in scored[:top_k]]
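
# Scoring example (illustrative): for the query "graph provenance links",
# _tokenize yields {"graph", "provenance", "links"}. A chunk containing
# "provenance" and "links" but not "graph" overlaps on 2 of the 3 query
# tokens, so its score is 2/3 ≈ 0.6667.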


def list_documents() -> list[dict]:
    """Return a summary list of all stored documents (no content, no chunks)."""
    store = _load()
    return [
        {
            "id": doc["id"],
            "title": doc["title"],
            "source": doc["source"],
            "chunk_count": len(doc["chunks"]),
            "ingestion_date": doc["ingestion_date"],
        }
        for doc in store["documents"].values()
    ]
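

# Minimal smoke test (a sketch, not part of the original module's API; the
# sample document is hypothetical). Point MINI_CONTEXT_GRAPH_DATA_DIR at a
# throwaway directory before running to avoid touching real data.
if __name__ == "__main__":
    sample = add_document(
        doc_id="demo_doc",
        title="Demo",
        source="inline",
        content="Provenance links tie graph nodes back to chunks. " * 20,
    )
    print(f"stored {len(sample['chunks'])} chunks")
    for hit in search_chunks("graph provenance", top_k=3):
        print(hit["chunk_id"], hit["score"])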