# documents_store.py — from the mini-context-graph skill (mirrored from github/awesome-copilot).
"""
|
||||
documents_store.py — Persistent storage for raw documents and chunks (RAG layer).
|
||||
|
||||
Inspired by Karpathy's LLM Wiki pattern: raw sources are immutable and stored
|
||||
as the ground truth. Chunks are the retrieval unit; provenance links tie graph
|
||||
nodes/edges back to specific chunks.
|
||||
|
||||
Handles:
|
||||
- Storing raw documents with metadata
|
||||
- Chunking documents into overlapping text windows
|
||||
- Retrieving chunks by id or by keyword search
|
||||
- Persisting to data/documents.json
|
||||
"""
|
||||
from __future__ import annotations

import json
import os
import re
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path

# Make the package root importable so the sibling `config` module resolves
# when this file is run directly as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))
import config

# Storage location: an env var override (useful for tests) wins over the
# configured data directory.
_DATA_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_DATA_DIR", str(config.DATA_DIR)))
_DOCS_FILE = _DATA_DIR / "documents.json"

# Chunking parameters, both measured in characters.
_CHUNK_SIZE = 500      # window size per chunk
_CHUNK_OVERLAP = 100   # overlap between consecutive windows

# Common English function words excluded from keyword tokenization.
_STOPWORDS = frozenset(
    """
    a an the is are was were be been being
    have has had do does did will would could
    should may might shall can to of in on
    at by for with from and or but not it
    its this that these those i you he she
    we they what which who how why when where
    """.split()
)


def _load() -> dict:
    """Load the document store from disk.

    Returns:
        The parsed store dict, or a fresh empty store (``{"documents": {}}``)
        when no file exists yet.
    """
    if _DOCS_FILE.exists():
        # Explicit encoding: the file is written as UTF-8 by _save(); without
        # it, reads would depend on the platform's locale default.
        with open(_DOCS_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    return {"documents": {}}


def _save(store: dict) -> None:
    """Persist the whole store dict to data/documents.json.

    Creates the data directory on first use. Writes UTF-8 explicitly so the
    on-disk bytes don't depend on the platform's locale default encoding.
    """
    _DATA_DIR.mkdir(parents=True, exist_ok=True)
    with open(_DOCS_FILE, "w", encoding="utf-8") as f:
        json.dump(store, f, indent=2)


def _tokenize(text: str) -> list[str]:
    """Lowercase *text* and return its alphanumeric tokens, dropping
    stopwords and single-character tokens."""
    return [
        tok
        for tok in re.findall(r"[a-z0-9]+", text.lower())
        if len(tok) > 1 and tok not in _STOPWORDS
    ]


def _chunk_text(content: str, chunk_size: int = _CHUNK_SIZE, overlap: int = _CHUNK_OVERLAP) -> list[str]:
    """Split content into overlapping character windows.

    Args:
        content: Text to split; empty input yields an empty list.
        chunk_size: Window width in characters; must be positive.
        overlap: Characters shared between consecutive windows; must be
            smaller than chunk_size.

    Returns:
        Stripped, non-empty window strings in document order.

    Raises:
        ValueError: If chunk_size <= 0 or overlap >= chunk_size — either
            would make the step size non-positive and loop forever.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    chunks = []
    start = 0
    while start < len(content):
        end = start + chunk_size
        chunks.append(content[start:end].strip())
        if end >= len(content):
            break
        start += chunk_size - overlap
    # Drop windows that were all whitespace.
    return [c for c in chunks if c]


def add_document(
    doc_id: str,
    title: str,
    source: str,
    content: str,
) -> dict:
    """
    Store a raw document and auto-generate chunks.

    Args:
        doc_id: Caller-supplied stable identifier (e.g. "doc_001" or a filename).
        title: Human-readable title.
        source: Origin path/URL (immutable provenance pointer).
        content: Full raw text to store and chunk.

    Returns:
        The stored document dict including generated chunk_ids.
    """
    store = _load()

    # Idempotent: a doc_id that is already present is returned unchanged.
    existing = store["documents"].get(doc_id)
    if existing is not None:
        return existing

    # Chunk ids are zero-padded so they sort lexicographically in index order.
    chunks = [
        {
            "chunk_id": f"{doc_id}_chunk_{idx:03d}",
            "index": idx,
            "text": piece,
        }
        for idx, piece in enumerate(_chunk_text(content))
    ]

    record = {
        "id": doc_id,
        "title": title,
        "source": source,
        "content": content,
        "chunks": chunks,
        "ingestion_date": datetime.now(timezone.utc).isoformat(),
    }
    store["documents"][doc_id] = record
    _save(store)
    return record


def get_document(doc_id: str) -> dict | None:
    """Return the full document record, or None when doc_id is unknown."""
    return _load()["documents"].get(doc_id)


def get_chunk(chunk_id: str) -> dict | None:
    """Return a specific chunk by its chunk_id (searches across all documents)."""
    store = _load()
    every_chunk = (
        chunk
        for document in store["documents"].values()
        for chunk in document["chunks"]
    )
    # First match wins; None when no chunk carries this id.
    return next((c for c in every_chunk if c["chunk_id"] == chunk_id), None)


def get_chunks_for_document(doc_id: str) -> list[dict]:
    """Return all chunks for a document (empty list for an unknown doc_id)."""
    document = get_document(doc_id)
    return [] if document is None else document["chunks"]


def search_chunks(query: str, top_k: int = 5) -> list[dict]:
    """
    Keyword search over chunk text. Returns top_k matching chunks sorted by
    term overlap (simple TF-style scoring, no embeddings required).

    Returns list of dicts with keys: chunk_id, doc_id, score, text.
    """
    store = _load()
    terms = set(_tokenize(query))
    if not terms:
        # Query contained only stopwords / punctuation — nothing to match on.
        return []

    hits: list[tuple[float, dict]] = []
    for document in store["documents"].values():
        for chunk in document["chunks"]:
            matched = terms.intersection(_tokenize(chunk["text"]))
            if not matched:
                continue
            # Score = fraction of query terms present in this chunk.
            score = len(matched) / len(terms)
            hits.append((score, {
                "chunk_id": chunk["chunk_id"],
                "doc_id": document["id"],
                "doc_title": document["title"],
                "score": round(score, 4),
                "text": chunk["text"],
            }))

    # Stable sort keeps document order among equal scores.
    hits.sort(key=lambda pair: pair[0], reverse=True)
    return [payload for _, payload in hits[:top_k]]


def list_documents() -> list[dict]:
    """Return a summary list of all stored documents (no content, no chunks)."""
    summaries = []
    for doc in _load()["documents"].values():
        summaries.append({
            "id": doc["id"],
            "title": doc["title"],
            "source": doc["source"],
            "chunk_count": len(doc["chunks"]),
            "ingestion_date": doc["ingestion_date"],
        })
    return summaries