mirror of
https://github.com/github/awesome-copilot.git
synced 2026-05-05 06:35:56 +00:00
add mini-context-graph skill (#1580)
* add mini-context-graph skill * remove pycache files * filename case update to SKILL.md * update readme
This commit is contained in:
@@ -0,0 +1,194 @@
|
||||
---
|
||||
name: mini-context-graph
|
||||
description: |
|
||||
A persistent, compounding knowledge base combining Karpathy's LLM Wiki pattern
|
||||
with a structured knowledge graph. Ingest documents once — the LLM writes wiki
|
||||
pages, extracts entities/relations into the graph, and stores raw content for
|
||||
evidence retrieval. Knowledge accumulates and cross-references; it is never
|
||||
re-derived from scratch.
|
||||
---
|
||||
|
||||
# Mini Context Graph Skill
|
||||
|
||||
## The Core Idea
|
||||
|
||||
Standard RAG re-discovers knowledge from scratch on every query. This skill is different:
|
||||
|
||||
1. **Wiki layer** — The LLM writes and maintains persistent markdown pages (summaries, entity pages, topic syntheses). Cross-references are already there. The wiki gets richer with every ingest.
|
||||
2. **Graph layer** — Entities and relations are extracted once and stored as a navigable knowledge graph. BFS traversal answers structural queries without re-reading sources.
|
||||
3. **Raw source layer** — Original documents are stored immutably with chunks. Provenance links tie every graph node and edge back to the exact text that supports it.
|
||||
|
||||
> The LLM writes; the Python tools handle all bookkeeping.
|
||||
|
||||
---
|
||||
|
||||
## Three Layers
|
||||
|
||||
| Layer | Where | What the LLM does | What Python does |
|
||||
|-------|-------|-------------------|-----------------|
|
||||
| **Raw Sources** | `data/documents.json` | Reads (never modifies) | Stores chunks + metadata |
|
||||
| **Wiki** | `wiki/` (markdown) | Writes/updates pages | Manages index.md + log.md |
|
||||
| **Graph** | `data/graph.json` | Extracts entities + relations | Persists, deduplicates, traverses |
|
||||
|
||||
---
|
||||
|
||||
## ⚡ Quick Start for Agents
|
||||
|
||||
```python
|
||||
from scripts.contextgraph import ContextGraphSkill
|
||||
from scripts.tools import wiki_store
|
||||
|
||||
skill = ContextGraphSkill()
|
||||
|
||||
# ===== INGEST WITH FULL RAG + WIKI =====
|
||||
# 1. Read references/ingestion.md and references/ontology.md first
|
||||
# 2. Extract entities and relations (LLM reasoning step)
|
||||
entities = [
|
||||
{"name": "memory leak", "type": "issue", "supporting_text": "memory leaks cause crashes"},
|
||||
{"name": "system crash", "type": "issue", "supporting_text": "system crashes due to memory leaks"},
|
||||
]
|
||||
relations = [
|
||||
{"source": "memory leak", "target": "system crash", "type": "causes",
|
||||
"confidence": 1.0, "supporting_text": "System crashes due to memory leaks."},
|
||||
]
|
||||
|
||||
result = skill.ingest_with_content(
|
||||
doc_id="doc_001",
|
||||
title="System Crash Analysis",
|
||||
source="/docs/incident_report.pdf",
|
||||
raw_content="System crashes due to memory leaks. Memory leaks occur when objects are not released.",
|
||||
entities=entities,
|
||||
relations=relations,
|
||||
)
|
||||
# result = {"doc_id": "doc_001", "chunk_count": 1, "nodes_added": 2, "edges_added": 1}
|
||||
|
||||
# 3. Write a wiki summary page for this document
|
||||
wiki_store.write_page(
|
||||
category="summary",
|
||||
title="System Crash Analysis Summary",
|
||||
content="""---
|
||||
title: System Crash Analysis
|
||||
source_document: doc_001
|
||||
tags: [summary, incident]
|
||||
---
|
||||
|
||||
# System Crash Analysis
|
||||
|
||||
**Source:** incident_report.pdf
|
||||
|
||||
## Key Claims
|
||||
|
||||
- [[memory-leak]] causes [[system-crash]] (confidence: 1.0)
|
||||
|
||||
## Entities
|
||||
|
||||
- [[memory-leak]] (issue)
|
||||
- [[system-crash]] (issue)
|
||||
""",
|
||||
summary="Incident report: memory leaks cause system crashes.",
|
||||
)
|
||||
|
||||
# ===== QUERY WITH EVIDENCE =====
|
||||
result = skill.query_with_evidence("Why does the system crash?")
|
||||
# Returns: {"query": ..., "subgraph": ..., "supporting_documents": [...], "evidence_chain": ...}
|
||||
|
||||
# ===== WIKI SEARCH (read wiki before answering) =====
|
||||
pages = wiki_store.search_wiki("memory leak")
|
||||
# Returns: [{slug, category, path, snippet}, ...]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Operations
|
||||
|
||||
### Ingest
|
||||
|
||||
When a user provides a new document:
|
||||
|
||||
1. Read `references/ingestion.md` — entity/relation extraction rules.
|
||||
2. Read `references/ontology.md` — type normalization rules.
|
||||
3. Extract entities and relations using your LLM reasoning.
|
||||
4. Call `skill.ingest_with_content(...)` — stores raw content + chunks + graph nodes + provenance.
|
||||
5. **Write a wiki summary page** using `wiki_store.write_page(category="summary", ...)`.
|
||||
6. **Update entity pages** — for each new/updated entity, write or update `wiki_store.write_page(category="entity", ...)`.
|
||||
7. **Update topic pages** if the document touches an existing synthesis topic.
|
||||
8. A single document ingest will typically touch 3–10 wiki pages.
|
||||
|
||||
### Query
|
||||
|
||||
When a user asks a question:
|
||||
|
||||
1. **Check the wiki first** — `wiki_store.search_wiki(query)` to find relevant pages. Read them.
|
||||
2. If the wiki has a good answer, synthesize from wiki pages (fast path).
|
||||
3. If deeper graph traversal is needed, call `skill.query_with_evidence(query)`.
|
||||
4. Return the answer with evidence citations from `supporting_documents`.
|
||||
5. If the answer is valuable, file it back as a new wiki topic page.
|
||||
|
||||
### Lint
|
||||
|
||||
Periodically health-check the wiki:
|
||||
|
||||
```python
|
||||
from scripts.tools import wiki_store
|
||||
issues = wiki_store.lint_wiki()
|
||||
# Returns: {orphan_pages, missing_pages, broken_wikilinks, isolated_pages}
|
||||
```
|
||||
|
||||
Ask the LLM to review and fix: broken links, orphan pages, stale claims, missing cross-references. See `references/lint.md` for full lint workflow.
|
||||
|
||||
---
|
||||
|
||||
## Ingestion Constraints
|
||||
|
||||
- ❌ Do NOT hallucinate entities not present in the text
|
||||
- ❌ Do NOT add relations without explicit textual evidence
|
||||
- ❌ Do NOT add edges with confidence < 0.6
|
||||
- ✅ Provide `supporting_text` for every entity and relation — this enables provenance
|
||||
- ✅ Write a wiki summary page for every ingested document
|
||||
- ✅ Update existing entity pages when new information arrives
|
||||
- ✅ Flag contradictions in wiki pages when new data conflicts with old claims
|
||||
|
||||
---
|
||||
|
||||
## Retrieval Constraints
|
||||
|
||||
- 🔒 Traversal depth MUST NOT exceed 2 (config: MAX_GRAPH_DEPTH)
|
||||
- 🔒 Only edges with confidence ≥ 0.6 (config: MIN_CONFIDENCE)
|
||||
- 🔒 Maximum 50 nodes returned (config: MAX_NODES)
|
||||
- ❌ Do NOT fabricate nodes or edges not in the graph
|
||||
|
||||
---
|
||||
|
||||
## Full Python API Reference
|
||||
|
||||
| Method | Purpose | When to Use |
|
||||
|--------|---------|-------------|
|
||||
| `skill.ingest_with_content(doc_id, title, source, raw_content, entities, relations)` | Full RAG ingest: raw docs + graph + provenance | Every new document |
|
||||
| `skill.add_node(name, node_type)` | Add single entity (no provenance) | Quick additions without a source doc |
|
||||
| `skill.add_edge(source_name, target_name, relation, confidence)` | Add single relation | Quick additions without a source doc |
|
||||
| `skill.query(query)` | Graph-only retrieval → subgraph | Structural queries |
|
||||
| `skill.query_with_evidence(query)` | Graph + provenance → subgraph + source chunks | Queries requiring citations |
|
||||
| `wiki_store.write_page(category, title, content, summary)` | Write/update a wiki page | After every ingest; after answering queries |
|
||||
| `wiki_store.read_page(category, title)` | Read a wiki page | Before answering; for cross-referencing |
|
||||
| `wiki_store.search_wiki(query)` | Keyword search across wiki | Fast path before graph traversal |
|
||||
| `wiki_store.list_pages(category)` | List all wiki pages | Getting an overview |
|
||||
| `wiki_store.get_log(last_n)` | Read recent operations | Understanding wiki history |
|
||||
| `wiki_store.lint_wiki()` | Health check | Periodic maintenance |
|
||||
| `documents_store.list_documents()` | List all ingested raw sources | Audit / provenance checking |
|
||||
| `documents_store.search_chunks(query)` | Chunk-level search | Finding specific evidence |
|
||||
|
||||
---
|
||||
|
||||
## Design Philosophy
|
||||
|
||||
> "The wiki is a persistent, compounding artifact. The cross-references are already there. The synthesis already reflects everything you've read." — Karpathy
|
||||
|
||||
| Layer | What Happens | Who Owns It |
|
||||
|-------|-----------|-------------|
|
||||
| **LLM Reasoning** | Extraction, synthesis, writing wiki pages | Agent (.md guidance files) |
|
||||
| **Wiki Persistence** | Index, log, file I/O | `wiki_store.py` |
|
||||
| **Graph Persistence** | Dedup, index, BFS traverse | `graph_store.py`, `retrieval_engine.py` |
|
||||
| **Raw Source Storage** | Immutable docs + chunks + provenance | `documents_store.py` |
|
||||
|
||||
The human curates sources and asks questions. The LLM writes the wiki, extracts the graph, and answers with citations. Python handles all bookkeeping.
|
||||
|
||||
@@ -0,0 +1,196 @@
|
||||
# Ingestion Instructions
|
||||
|
||||
This file defines how the agent extracts entities and relations from a raw document.
|
||||
|
||||
---
|
||||
|
||||
## Step 1: Read the Document
|
||||
|
||||
Read the provided text carefully. Identify:
|
||||
- **Entities**: noun phrases that refer to real-world objects, systems, components, actors, concepts, or events.
|
||||
- **Relations**: verb phrases that describe how one entity affects, contains, causes, uses, or is related to another.
|
||||
|
||||
---
|
||||
|
||||
## Step 2: Extract Entities
|
||||
|
||||
For each entity:
|
||||
- Record its **name** (normalized: lowercase, strip leading/trailing whitespace)
|
||||
- Assign a **type**: a short label (1–3 words) that categorizes the entity
|
||||
|
||||
### Entity Type Examples
|
||||
|
||||
| Entity Name | Suggested Type |
|
||||
|-------------|---------------|
|
||||
| Python interpreter | software |
|
||||
| memory leak | issue |
|
||||
| operating system | system |
|
||||
| database | infrastructure |
|
||||
| user | actor |
|
||||
| API endpoint | interface |
|
||||
| server | infrastructure |
|
||||
|
||||
**Rules:**
|
||||
- Types must be general enough to reuse across documents
|
||||
- Do NOT create unique types per entity (e.g., avoid `python-interpreter-type`)
|
||||
- Use `ontology.md` normalization rules to canonicalize types
|
||||
|
||||
---
|
||||
|
||||
## Step 3: Extract Relations
|
||||
|
||||
For each pair of entities with an explicit connection in the text:
|
||||
- Record the **source** entity name
|
||||
- Record the **target** entity name
|
||||
- Record the **relation type**: a verb or verb phrase (normalized: lowercase)
|
||||
- Assign a **confidence** score between 0 and 1:
|
||||
- 1.0 = stated explicitly ("A causes B")
|
||||
- 0.8 = strongly implied ("A is linked to B")
|
||||
- 0.6 = weakly implied ("A may affect B")
|
||||
- < 0.6 = do NOT include
|
||||
|
||||
---
|
||||
|
||||
## Step 4: Output Format
|
||||
|
||||
Produce a JSON object in this exact format:
|
||||
|
||||
```json
|
||||
{
|
||||
"entities": [
|
||||
{ "name": "entity name", "type": "entity type", "supporting_text": "exact quote mentioning this entity" }
|
||||
],
|
||||
"relations": [
|
||||
{
|
||||
"source": "source entity name",
|
||||
"target": "target entity name",
|
||||
"type": "relation type",
|
||||
"confidence": 0.9,
|
||||
"supporting_text": "exact quote that justifies this relation"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The `supporting_text` field is **required for provenance**. It must be a verbatim or near-verbatim quote from the document that mentions or supports the entity/relation. This is what links graph nodes and edges back to their source.
|
||||
|
||||
---
|
||||
|
||||
## Rules
|
||||
|
||||
- All names and types must be **lowercase**
|
||||
- Only include relations where **both entities** are present in the entities list
|
||||
- Do NOT invent entities or relations not supported by the text
|
||||
- Prefer **reusing existing entity and relation types** from the ontology over creating new ones
|
||||
- One entity can appear in multiple relations (as source or target)
|
||||
- Always include `supporting_text` — this enables evidence retrieval and audit trails
|
||||
|
||||
---
|
||||
|
||||
## Step 5: Write Wiki Pages (Required)
|
||||
|
||||
After calling `skill.ingest_with_content(...)`, you MUST write wiki pages:
|
||||
|
||||
### 5a. Write a summary page for the document
|
||||
|
||||
```python
|
||||
from scripts.tools import wiki_store
|
||||
|
||||
wiki_store.write_page(
|
||||
category="summary",
|
||||
title=f"{title} Summary",
|
||||
content=f"""---
|
||||
title: {title}
|
||||
source_document: {doc_id}
|
||||
tags: [summary]
|
||||
---
|
||||
|
||||
# {title}
|
||||
|
||||
**Source:** {source}
|
||||
|
||||
## Key Claims
|
||||
|
||||
{chr(10).join(f'- [[{r["source"].replace(" ", "-")}]] {r["type"]} [[{r["target"].replace(" ", "-")}]] (confidence: {r["confidence"]})' for r in relations)}
|
||||
|
||||
## Entities
|
||||
|
||||
{chr(10).join(f'- [[{e["name"].replace(" ", "-")}]] ({e["type"]})' for e in entities)}
|
||||
|
||||
## Open Questions
|
||||
|
||||
- (Add questions from reading the document here)
|
||||
""",
|
||||
summary=f"Summary of {title}",
|
||||
)
|
||||
```
|
||||
|
||||
### 5b. Write or update entity pages
|
||||
|
||||
For each **new** entity not already in the wiki, write an entity page:
|
||||
|
||||
```python
|
||||
wiki_store.write_page(
|
||||
category="entity",
|
||||
title=entity_name,
|
||||
content=f"""---
|
||||
title: {entity_name}
|
||||
type: {entity_type}
|
||||
source_document: {doc_id}
|
||||
tags: [{entity_type}]
|
||||
---
|
||||
|
||||
# {entity_name}
|
||||
|
||||
(Description from the document or prior knowledge.)
|
||||
|
||||
## Relations
|
||||
|
||||
(List any wikilinks to related entities extracted from relations.)
|
||||
|
||||
## Mentioned in
|
||||
|
||||
- [[{doc_id}-summary]]
|
||||
""",
|
||||
summary=f"{entity_name}: {entity_type}",
|
||||
)
|
||||
```
|
||||
|
||||
For **existing** entity pages, read the current page and append new information, updated relations, or flag contradictions.
|
||||
|
||||
---
|
||||
|
||||
## Example
|
||||
|
||||
**Input document:**
|
||||
```
|
||||
System crashes due to memory leaks.
|
||||
Memory leaks occur when objects are not released.
|
||||
```
|
||||
|
||||
**Expected extraction output:**
|
||||
```json
|
||||
{
|
||||
"entities": [
|
||||
{ "name": "system crash", "type": "issue", "supporting_text": "system crashes due to memory leaks" },
|
||||
{ "name": "memory leak", "type": "issue", "supporting_text": "memory leaks occur when objects are not released" },
|
||||
{ "name": "object", "type": "component", "supporting_text": "objects are not released" }
|
||||
],
|
||||
"relations": [
|
||||
{
|
||||
"source": "memory leak",
|
||||
"target": "system crash",
|
||||
"type": "causes",
|
||||
"confidence": 1.0,
|
||||
"supporting_text": "System crashes due to memory leaks."
|
||||
},
|
||||
{
|
||||
"source": "object",
|
||||
"target": "memory leak",
|
||||
"type": "contributes to",
|
||||
"confidence": 0.9,
|
||||
"supporting_text": "Memory leaks occur when objects are not released."
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,163 @@
|
||||
# Lint Instructions
|
||||
|
||||
This file defines the wiki health-check workflow.
|
||||
|
||||
Run this periodically (or after a large batch of ingests) to keep the wiki
|
||||
clean and accurate. The pattern is from Karpathy's LLM Wiki: detect contradictions,
|
||||
orphans, broken links, stale claims, and data gaps.
|
||||
|
||||
---
|
||||
|
||||
## When to Run
|
||||
|
||||
- After ingesting 5+ documents
|
||||
- When the user asks "check the wiki" or "health check"
|
||||
- When answers seem inconsistent or contradictory
|
||||
- Before a major synthesis or presentation
|
||||
|
||||
---
|
||||
|
||||
## Step 1: Run the Automated Health Check
|
||||
|
||||
```python
|
||||
from scripts.tools import wiki_store
|
||||
|
||||
issues = wiki_store.lint_wiki()
|
||||
# Returns:
|
||||
# {
|
||||
# "orphan_pages": [list of slugs in files but not in index],
|
||||
# "missing_pages": [list of slugs in index but file deleted],
|
||||
# "broken_wikilinks": {slug: [broken link targets]},
|
||||
# "isolated_pages": [slugs with no wikilinks at all],
|
||||
# }
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step 2: Triage Each Issue Type
|
||||
|
||||
### Orphan Pages
|
||||
Pages exist on disk but are not in the index. They are invisible to search.
|
||||
**Fix**: Add them to the index or delete if stale.
|
||||
|
||||
```python
|
||||
# To add to index, re-write the page (this auto-updates the index):
|
||||
wiki_store.write_page(category="...", title="...", content=existing_content)
|
||||
|
||||
# To delete (manual step — confirm with user first):
|
||||
# rm wiki/{category}/{slug}.md
|
||||
```
|
||||
|
||||
### Missing Pages
|
||||
In the index but the file was deleted. Dangling references.
|
||||
**Fix**: Either recreate the page from knowledge or remove from index.
|
||||
|
||||
### Broken Wikilinks
|
||||
`[[slug]]` references that point to pages that don't exist.
|
||||
**Fix**: Create the missing page, or correct the link.
|
||||
|
||||
### Isolated Pages
|
||||
Pages with no `[[wikilinks]]` — they are unreachable via link traversal.
|
||||
**Fix**: Add links from/to related pages.
|
||||
|
||||
---
|
||||
|
||||
## Step 3: Check for Contradictions
|
||||
|
||||
Read the wiki index and scan for pages that might contradict each other:
|
||||
|
||||
```python
|
||||
pages = wiki_store.list_pages()
|
||||
# Returns [{slug, category, summary, date}, ...]
|
||||
```
|
||||
|
||||
Look for:
|
||||
- Same entity with conflicting `type` in different pages
|
||||
- Same relation with different direction in different pages
|
||||
- Newer ingests that update/supersede older claims
|
||||
|
||||
**When you find a contradiction:**
|
||||
- Add a `## Contradictions` section to the relevant entity/topic pages:
|
||||
```markdown
|
||||
## Contradictions
|
||||
- doc_001 says X; doc_003 says not-X — unresolved
|
||||
```
|
||||
- Flag it in the log:
|
||||
```python
|
||||
# Handled by wiki_store.write_page which auto-appends to log.md
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step 4: Check for Stale Claims
|
||||
|
||||
Review pages ingested more than N days ago (use the `date` field from the index).
|
||||
Ask: "Has any newer document superseded this claim?"
|
||||
|
||||
**When a claim is stale:**
|
||||
- Update the page: add a `## Superseded` section or update the body.
|
||||
- Mark the old claim with _(superseded by [[newer-doc-summary]])_.
|
||||
|
||||
---
|
||||
|
||||
## Step 5: Check for Missing Cross-References
|
||||
|
||||
For each entity page, check: does it link back to all summary pages that mention it?
|
||||
For each summary page, check: does it link to all entity pages it extracted?
|
||||
|
||||
**Fix**: Read the page and add missing `[[slug]]` links.
|
||||
|
||||
---
|
||||
|
||||
## Step 6: Identify Data Gaps
|
||||
|
||||
Review entity pages that lack:
|
||||
- A proper description (just a stub)
|
||||
- Any `## Relations` section
|
||||
- Any `## Mentioned in` links
|
||||
|
||||
These are candidates for deeper research or new ingests.
|
||||
|
||||
---
|
||||
|
||||
## Step 7: Log the Lint Pass
|
||||
|
||||
```python
|
||||
# wiki_store.write_page automatically logs the activity.
|
||||
# For a manual lint summary, append to log.md via write_page on a topic:
|
||||
wiki_store.write_page(
|
||||
category="topic",
|
||||
title="Lint Pass YYYY-MM-DD",
|
||||
content="# Lint Pass\n\n## Issues Found\n\n...\n\n## Fixed\n\n...",
|
||||
summary="Lint pass results",
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Quick Lint Commands
|
||||
|
||||
```python
|
||||
from scripts.tools import wiki_store
|
||||
|
||||
# Full health check
|
||||
issues = wiki_store.lint_wiki()
|
||||
|
||||
# Get recent history
|
||||
log = wiki_store.get_log(last_n=10)
|
||||
|
||||
# List all pages
|
||||
all_pages = wiki_store.list_pages()
|
||||
|
||||
# Search for a concept across wiki
|
||||
results = wiki_store.search_wiki("memory leak")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Rules
|
||||
|
||||
- NEVER delete pages without user confirmation
|
||||
- NEVER auto-resolve a contradiction — flag it for human review
|
||||
- File all lint results as a topic page in the wiki (so the history is visible)
|
||||
- Prefer adding cross-references over rewriting existing content
|
||||
@@ -0,0 +1,99 @@
|
||||
# Ontology Instructions
|
||||
|
||||
This file defines the rules for maintaining and evolving the dynamic ontology used by the Context Graph.
|
||||
|
||||
---
|
||||
|
||||
## Core Principle
|
||||
|
||||
The ontology is **NOT fixed**. Types and relations emerge from documents as they are ingested.
|
||||
However, the ontology must remain **compact, consistent, and reusable**.
|
||||
|
||||
---
|
||||
|
||||
## Entity Type Rules
|
||||
|
||||
### Normalization
|
||||
|
||||
When assigning an entity type, apply these transformations:
|
||||
1. Convert to **lowercase**
|
||||
2. Strip leading/trailing whitespace
|
||||
3. Replace underscores and hyphens with spaces
|
||||
4. Merge synonymous types using the mapping table below
|
||||
|
||||
### Synonym Mapping (Entity Types)
|
||||
|
||||
| Variant | Canonical Type |
|
||||
|---------|---------------|
|
||||
| component, module, class, function | component |
|
||||
| bug, defect, fault, error, failure | issue |
|
||||
| server, host, machine, node | infrastructure |
|
||||
| user, person, operator, admin, actor | actor |
|
||||
| app, application, service, program, software | software |
|
||||
| database, datastore, db, storage | storage |
|
||||
| api, endpoint, interface, connection | interface |
|
||||
| event, incident, occurrence, trigger | event |
|
||||
| concept, idea, principle, theory | concept |
|
||||
| process, thread, task, job, workflow | process |
|
||||
|
||||
### Adding New Types
|
||||
|
||||
If an entity does not match any existing type:
|
||||
- Create a **new type** if it is genuinely distinct
|
||||
- Keep the label short (1–3 words, lowercase)
|
||||
- Consider whether an existing type is close enough before creating a new one
|
||||
|
||||
### Constraint
|
||||
|
||||
- Maximum ~50 distinct entity types across the entire ontology
|
||||
- If the limit is approached, merge similar types rather than creating new ones
|
||||
|
||||
---
|
||||
|
||||
## Relation Type Rules
|
||||
|
||||
### Normalization
|
||||
|
||||
When assigning a relation type:
|
||||
1. Convert to **lowercase**
|
||||
2. Strip whitespace
|
||||
3. Use verb phrases in **present tense** (e.g., "causes", "contains", "uses")
|
||||
4. Merge synonyms using the mapping table below
|
||||
|
||||
### Synonym Mapping (Relation Types)
|
||||
|
||||
| Variant | Canonical Relation |
|
||||
|---------|-------------------|
|
||||
| triggers, leads to, results in, produces | causes |
|
||||
| is part of, belongs to, lives in, sits in | contains |
|
||||
| depends on, requires, needs | depends on |
|
||||
| uses, calls, invokes, consumes | uses |
|
||||
| affects, impacts, influences | affects |
|
||||
| creates, instantiates, spawns | creates |
|
||||
| connects to, links to, references | connects to |
|
||||
| inherits from, extends, subclasses | extends |
|
||||
| reads from, queries, fetches | reads from |
|
||||
| writes to, stores in, persists to | writes to |
|
||||
|
||||
### Adding New Relations
|
||||
|
||||
- Only add new relation types if no existing type accurately describes the relationship
|
||||
- Prefer canonical relations over creating new ones
|
||||
|
||||
---
|
||||
|
||||
## Ontology Update Protocol
|
||||
|
||||
When processing extracted entities/relations from `ingestion.md`:
|
||||
|
||||
1. For each entity type:
|
||||
- Run through the synonym mapping
|
||||
- Call `ontology_store.normalize_type(type_name)` to get the canonical form
|
||||
- Call `ontology_store.add_type(canonical_type)` to register it
|
||||
|
||||
2. For each relation type:
|
||||
- Run through the synonym mapping
|
||||
- Call `ontology_store.normalize_relation(relation_name)` to get the canonical form
|
||||
- Call `ontology_store.add_relation(canonical_relation)` to register it
|
||||
|
||||
3. Use the **canonical** type/relation names when creating nodes and edges in the graph.
|
||||
@@ -0,0 +1,163 @@
|
||||
# Retrieval Instructions
|
||||
|
||||
This file defines how the agent answers queries using the two-layer retrieval strategy:
|
||||
**wiki-first** (fast path), then **graph traversal with evidence** (deep path).
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Retrieval is a 7-step process:
|
||||
|
||||
1. Parse the query
|
||||
2. **Check the wiki first** (fast path)
|
||||
3. Find seed nodes in the graph
|
||||
4. Expand the graph via BFS
|
||||
5. Prune noisy nodes
|
||||
6. Build the subgraph with provenance
|
||||
7. Return structured context
|
||||
|
||||
---
|
||||
|
||||
## Step 1: Parse the Query
|
||||
|
||||
Read the query string and identify:
|
||||
- **Key noun phrases**: potential entity names (e.g., "system crash", "memory leak")
|
||||
- **Keywords**: individual meaningful words (e.g., "crash", "leak", "memory")
|
||||
- Normalize all terms to **lowercase**
|
||||
|
||||
Ignore stopwords (e.g., "the", "a", "is", "why", "does", "how", "what").
|
||||
|
||||
---
|
||||
|
||||
## Step 2: Check the Wiki First (Fast Path)
|
||||
|
||||
Before touching the graph, search the wiki. The wiki contains compiled knowledge —
|
||||
cross-references already resolved, contradictions flagged, syntheses written.
|
||||
|
||||
```python
|
||||
from scripts.tools import wiki_store
|
||||
|
||||
results = wiki_store.search_wiki(query)
|
||||
```
|
||||
|
||||
For each relevant result, read the page:
|
||||
|
||||
```python
|
||||
content = wiki_store.read_page_by_slug(result["slug"])
|
||||
```
|
||||
|
||||
**If the wiki has a sufficient answer:**
|
||||
- Synthesize from wiki pages.
|
||||
- Cite the source pages (e.g., "According to [[memory-leak]] and [[system-crash]]...").
|
||||
- File the answer as a new wiki topic page if it's valuable and not already captured:
|
||||
```python
|
||||
wiki_store.write_page(category="topic", title="Why System Crashes", content=..., summary=...)
|
||||
```
|
||||
- **Return early** — no graph traversal needed.
|
||||
|
||||
**If the wiki answer is incomplete or missing:** proceed to Step 3.
|
||||
|
||||
---
|
||||
|
||||
## Step 3: Find Seed Nodes
|
||||
|
||||
Call `index_store.search(query)` with the original query string.
|
||||
|
||||
This returns node IDs matching entity names or keywords.
|
||||
|
||||
If no seed nodes are found:
|
||||
- Try searching with individual keywords from Step 1.
|
||||
- If still no results, return an empty subgraph: "No relevant entities found."
|
||||
|
||||
---
|
||||
|
||||
## Step 4: Expand the Graph (BFS)
|
||||
|
||||
Call `retrieval_engine.retrieve(seed_node_ids, depth=2)`.
|
||||
|
||||
BFS from seed nodes:
|
||||
- **Depth 1**: direct neighbors
|
||||
- **Depth 2**: neighbors of neighbors
|
||||
|
||||
Rules:
|
||||
- Only traverse edges with confidence ≥ MIN_CONFIDENCE (from config.py)
|
||||
- Do NOT traverse beyond depth 2
|
||||
- Collect all visited node IDs
|
||||
|
||||
---
|
||||
|
||||
## Step 5: Prune Nodes
|
||||
|
||||
- Limit total nodes to MAX_NODES (from config.py)
|
||||
- Prioritize:
|
||||
1. Seed nodes (always include)
|
||||
2. Nodes at depth 1
|
||||
3. Nodes at depth 2 (as space allows)
|
||||
- Remove nodes only weakly connected (edge confidence < MIN_CONFIDENCE)
|
||||
|
||||
---
|
||||
|
||||
## Step 6: Build the Subgraph with Provenance
|
||||
|
||||
For a standard query, call:
|
||||
|
||||
```python
|
||||
subgraph = skill.query(query)
|
||||
# Returns: {"nodes": {node_id: {name, type, source_document, source_chunks}},
|
||||
# "edges": [{source, target, type, confidence, source_document, supporting_text, chunk_id}]}
|
||||
```
|
||||
|
||||
For queries requiring evidence (citations, fact-checking), call:
|
||||
|
||||
```python
|
||||
result = skill.query_with_evidence(query)
|
||||
# Returns:
|
||||
# {
|
||||
# "query": str,
|
||||
# "subgraph": {"nodes": {...}, "edges": [...]},
|
||||
# "supporting_documents": [
|
||||
# {
|
||||
# "doc_id": str,
|
||||
# "doc_title": str,
|
||||
# "supporting_chunks": [{"chunk_id": str, "text": str}, ...]
|
||||
# }
|
||||
# ],
|
||||
# "evidence_chain": "memory leak --[causes]--> system crash"
|
||||
# }
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step 7: Return Structured Context
|
||||
|
||||
Return the result with:
|
||||
- **Subgraph**: nodes + edges (the graph answer)
|
||||
- **Supporting documents**: source chunks that prove each relation
|
||||
- **Evidence chain**: human-readable path summary
|
||||
- **Wiki references**: links to relevant wiki pages found in Step 2
|
||||
|
||||
**If valuable, file the answer back into the wiki:**
|
||||
|
||||
```python
|
||||
wiki_store.write_page(
|
||||
category="topic",
|
||||
title=query,
|
||||
content=f"# {query}\n\n**Evidence chain:** {result['evidence_chain']}\n\n...",
|
||||
summary="...",
|
||||
)
|
||||
```
|
||||
|
||||
This way, future queries on the same topic find the answer instantly in the wiki.
|
||||
|
||||
---
|
||||
|
||||
## Rules
|
||||
|
||||
- NEVER fabricate nodes or edges not present in the graph
|
||||
- NEVER traverse deeper than depth 2
|
||||
- ALWAYS check the wiki before the graph (wiki-first)
|
||||
- Always include seed nodes in the result, even if they have no edges
|
||||
- Prefer edges with higher confidence when pruning
|
||||
- File valuable answers back into the wiki as topic pages
|
||||
- Return an empty subgraph (not an error) if no relevant nodes are found
|
||||
@@ -0,0 +1,23 @@
|
||||
"""
|
||||
config.py — Global configuration constants for the Context Graph Skill.
|
||||
|
||||
Data directories are resolved from environment variables so the skill can be
|
||||
used from any project without writing data inside the skill package itself.
|
||||
|
||||
MINI_CONTEXT_GRAPH_DATA_DIR — where graph.json, index.json, etc. live
|
||||
MINI_CONTEXT_GRAPH_WIKI_DIR — where wiki pages, index.md, and log.md live
|
||||
|
||||
Both default to subdirectories of the current working directory when the env
|
||||
vars are not set, so data ends up in the consuming project's directory.
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
_BASE = Path(os.environ.get("MINI_CONTEXT_GRAPH_BASE", str(Path.cwd())))
|
||||
DATA_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_DATA_DIR", str(_BASE / "data")))
|
||||
WIKI_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_WIKI_DIR", str(_BASE / "wiki")))
|
||||
|
||||
MAX_GRAPH_DEPTH: int = 2
|
||||
MIN_CONFIDENCE: float = 0.6
|
||||
MAX_NODES: int = 50
|
||||
@@ -0,0 +1,296 @@
|
||||
"""
|
||||
contextgraph.py — Main interface for the Context Graph Skill.
|
||||
|
||||
This file is orchestration-only. All LLM reasoning lives in the .md files.
|
||||
Python here only wires together the deterministic storage and retrieval tools.
|
||||
|
||||
Agent usage:
|
||||
- ingest(): agent reads ingestion.md + ontology.md, extracts entities/relations,
|
||||
then calls the tool methods directly.
|
||||
- query(): agent reads retrieval.md, calls index_store.search + retrieval_engine.retrieve,
|
||||
then calls graph_store.get_subgraph and returns the result.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
import config
|
||||
from tools import graph_store, index_store, ontology_store, retrieval_engine, documents_store
|
||||
|
||||
|
||||
class ContextGraphSkill:
|
||||
|
||||
def ingest(self, documents: list[str]) -> None:
|
||||
"""
|
||||
Orchestration entry point for ingesting documents into the context graph.
|
||||
|
||||
The agent (Copilot) MUST:
|
||||
1. Read ingestion.md to understand entity/relation extraction rules.
|
||||
2. Read ontology.md to apply type normalization.
|
||||
3. For each document, produce a JSON with entities + relations.
|
||||
4. For each entity:
|
||||
- ontology_store.add_type(entity["type"])
|
||||
- node_id = graph_store.add_node(entity["name"], entity["type"])
|
||||
- index_store.add_entity(entity["name"], node_id)
|
||||
5. For each relation (if confidence >= MIN_CONFIDENCE):
|
||||
- ontology_store.add_relation(relation["type"])
|
||||
- source_id = graph_store.find_node_by_name(relation["source"])
|
||||
- target_id = graph_store.find_node_by_name(relation["target"])
|
||||
- graph_store.add_edge(source_id, target_id, relation["type"], relation["confidence"])
|
||||
|
||||
This method does NOT call any LLM. It documents the agent contract only.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"ingest() must be driven by the Copilot agent following ingestion.md. "
|
||||
"Call the tool methods directly after LLM extraction."
|
||||
)
|
||||
|
||||
def query(self, query: str) -> dict:
|
||||
"""
|
||||
Orchestration entry point for retrieving a subgraph for a query.
|
||||
|
||||
The agent (Copilot) MUST:
|
||||
1. Read retrieval.md to understand the retrieval strategy.
|
||||
2. Call index_store.search(query) to get seed node_ids.
|
||||
3. Call retrieval_engine.retrieve(seed_ids, depth=MAX_GRAPH_DEPTH) to expand.
|
||||
4. Call graph_store.get_subgraph(node_ids) to build the result.
|
||||
5. Return the subgraph dict.
|
||||
|
||||
This method does NOT call any LLM. It documents the agent contract only.
|
||||
Returns an empty subgraph if called directly.
|
||||
"""
|
||||
seed_ids = index_store.search(query)
|
||||
if not seed_ids:
|
||||
return {"nodes": {}, "edges": []}
|
||||
|
||||
node_ids = retrieval_engine.retrieve(
|
||||
seed_ids,
|
||||
depth=config.MAX_GRAPH_DEPTH,
|
||||
min_confidence=config.MIN_CONFIDENCE,
|
||||
max_nodes=config.MAX_NODES,
|
||||
)
|
||||
return graph_store.get_subgraph(node_ids)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Convenience wrappers — agents may call these directly
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def add_node(self, name: str, node_type: str) -> str:
|
||||
"""Add a node to the graph and index. Returns node_id."""
|
||||
canonical_type = ontology_store.normalize_type(node_type)
|
||||
ontology_store.add_type(canonical_type)
|
||||
node_id = graph_store.add_node(name, canonical_type)
|
||||
index_store.add_entity(name, node_id)
|
||||
return node_id
|
||||
|
||||
def add_edge(
|
||||
self, source_name: str, target_name: str, relation: str, confidence: float
|
||||
) -> None:
|
||||
"""Add an edge between two nodes (by name) if both exist and confidence qualifies."""
|
||||
if confidence < config.MIN_CONFIDENCE:
|
||||
return
|
||||
|
||||
source_id = graph_store.find_node_by_name(source_name)
|
||||
target_id = graph_store.find_node_by_name(target_name)
|
||||
if source_id is None or target_id is None:
|
||||
return
|
||||
|
||||
canonical_relation = ontology_store.normalize_relation(relation)
|
||||
ontology_store.add_relation(canonical_relation)
|
||||
graph_store.add_edge(source_id, target_id, canonical_relation, confidence)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# LLM Wiki + RAG methods — store raw content & provenance
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def ingest_with_content(
|
||||
self,
|
||||
doc_id: str,
|
||||
title: str,
|
||||
source: str,
|
||||
raw_content: str,
|
||||
entities: list[dict],
|
||||
relations: list[dict],
|
||||
) -> dict:
|
||||
"""
|
||||
Full RAG ingestion: stores raw document + chunks, then wires provenance
|
||||
links from each graph node/edge back to source chunks.
|
||||
|
||||
The agent MUST:
|
||||
1. Read the raw_content.
|
||||
2. Read ingestion.md and ontology.md for extraction rules.
|
||||
3. Extract entities and relations (LLM reasoning step).
|
||||
4. Call this method with the results.
|
||||
|
||||
Args:
|
||||
doc_id: Stable document identifier (e.g. "doc_001").
|
||||
title: Human-readable document title.
|
||||
source: Origin path or URL (immutable, never modified).
|
||||
raw_content: Full text of the document.
|
||||
entities: List of dicts: [{name, type, supporting_text?}, ...]
|
||||
relations: List of dicts: [{source, target, type, confidence,
|
||||
supporting_text?, chunk_hint?}, ...]
|
||||
|
||||
Returns:
|
||||
Summary dict: {doc_id, chunk_count, nodes_added, edges_added}
|
||||
"""
|
||||
# Step 1: Store raw document and auto-chunk
|
||||
doc = documents_store.add_document(doc_id, title, source, raw_content)
|
||||
chunks = doc["chunks"]
|
||||
|
||||
def _find_best_chunk(text: str) -> str | None:
|
||||
"""Find the chunk whose text most overlaps with the given span."""
|
||||
if not text or not chunks:
|
||||
return None
|
||||
text_lower = text.lower()
|
||||
best_chunk_id = None
|
||||
best_score = 0
|
||||
for chunk in chunks:
|
||||
if text_lower in chunk["text"].lower():
|
||||
return chunk["chunk_id"]
|
||||
# Fallback: count overlapping words
|
||||
words_text = set(text_lower.split())
|
||||
words_chunk = set(chunk["text"].lower().split())
|
||||
score = len(words_text & words_chunk)
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_chunk_id = chunk["chunk_id"]
|
||||
return best_chunk_id
|
||||
|
||||
nodes_added = 0
|
||||
# Step 2: Ingest entities with provenance
|
||||
for entity in entities:
|
||||
supporting = entity.get("supporting_text", "")
|
||||
chunk_id = _find_best_chunk(supporting)
|
||||
chunk_ids = [chunk_id] if chunk_id else []
|
||||
|
||||
canonical_type = ontology_store.normalize_type(entity["type"])
|
||||
ontology_store.add_type(canonical_type)
|
||||
node_id = graph_store.add_node(
|
||||
entity["name"],
|
||||
canonical_type,
|
||||
source_document=doc_id,
|
||||
source_chunks=chunk_ids,
|
||||
)
|
||||
index_store.add_entity(entity["name"], node_id)
|
||||
nodes_added += 1
|
||||
|
||||
edges_added = 0
|
||||
# Step 3: Ingest relations with provenance
|
||||
for rel in relations:
|
||||
if rel.get("confidence", 0) < config.MIN_CONFIDENCE:
|
||||
continue
|
||||
|
||||
supporting = rel.get("supporting_text", "")
|
||||
chunk_id = _find_best_chunk(supporting) or rel.get("chunk_hint")
|
||||
|
||||
source_id = graph_store.find_node_by_name(rel["source"])
|
||||
target_id = graph_store.find_node_by_name(rel["target"])
|
||||
if source_id is None or target_id is None:
|
||||
continue
|
||||
|
||||
canonical_relation = ontology_store.normalize_relation(rel["type"])
|
||||
ontology_store.add_relation(canonical_relation)
|
||||
graph_store.add_edge(
|
||||
source_id,
|
||||
target_id,
|
||||
canonical_relation,
|
||||
rel["confidence"],
|
||||
source_document=doc_id,
|
||||
supporting_text=supporting or None,
|
||||
chunk_id=chunk_id,
|
||||
)
|
||||
edges_added += 1
|
||||
|
||||
return {
|
||||
"doc_id": doc_id,
|
||||
"chunk_count": len(chunks),
|
||||
"nodes_added": nodes_added,
|
||||
"edges_added": edges_added,
|
||||
}
|
||||
|
||||
def query_with_evidence(self, query: str) -> dict:
|
||||
"""
|
||||
Query the graph and return the subgraph together with supporting
|
||||
source documents and chunks (evidence chain).
|
||||
|
||||
Returns:
|
||||
{
|
||||
"query": str,
|
||||
"subgraph": {"nodes": {...}, "edges": [...]},
|
||||
"supporting_documents": [
|
||||
{
|
||||
"doc_id": str,
|
||||
"doc_title": str,
|
||||
"supporting_chunks": [{"chunk_id": str, "text": str}, ...]
|
||||
}
|
||||
],
|
||||
"evidence_chain": str # human-readable summary path
|
||||
}
|
||||
"""
|
||||
subgraph = self.query(query)
|
||||
if not subgraph["nodes"]:
|
||||
return {
|
||||
"query": query,
|
||||
"subgraph": subgraph,
|
||||
"supporting_documents": [],
|
||||
"evidence_chain": "No matching nodes found.",
|
||||
}
|
||||
|
||||
# Collect all provenance pointers from nodes and edges
|
||||
docs_chunks: dict[str, list[str]] = {} # doc_id -> [chunk_ids]
|
||||
|
||||
for node in subgraph["nodes"].values():
|
||||
doc_id = node.get("source_document")
|
||||
if doc_id:
|
||||
docs_chunks.setdefault(doc_id, [])
|
||||
docs_chunks[doc_id].extend(node.get("source_chunks") or [])
|
||||
|
||||
for edge in subgraph["edges"]:
|
||||
doc_id = edge.get("source_document")
|
||||
if doc_id:
|
||||
docs_chunks.setdefault(doc_id, [])
|
||||
if edge.get("chunk_id"):
|
||||
docs_chunks[doc_id].append(edge["chunk_id"])
|
||||
|
||||
# Resolve chunk texts from documents_store
|
||||
supporting_documents = []
|
||||
for doc_id, chunk_ids in docs_chunks.items():
|
||||
doc = documents_store.get_document(doc_id)
|
||||
if doc is None:
|
||||
continue
|
||||
seen = set()
|
||||
chunks_out = []
|
||||
for cid in chunk_ids:
|
||||
if cid in seen:
|
||||
continue
|
||||
seen.add(cid)
|
||||
chunk = documents_store.get_chunk(cid)
|
||||
if chunk:
|
||||
chunks_out.append({"chunk_id": cid, "text": chunk["text"]})
|
||||
if chunks_out:
|
||||
supporting_documents.append({
|
||||
"doc_id": doc_id,
|
||||
"doc_title": doc["title"],
|
||||
"supporting_chunks": chunks_out,
|
||||
})
|
||||
|
||||
# Build a simple evidence chain string
|
||||
chain_parts = []
|
||||
for edge in subgraph["edges"]:
|
||||
src_node = subgraph["nodes"].get(edge["source"], {})
|
||||
tgt_node = subgraph["nodes"].get(edge["target"], {})
|
||||
src_name = src_node.get("name", edge["source"])
|
||||
tgt_name = tgt_node.get("name", edge["target"])
|
||||
chain_parts.append(f"{src_name} --[{edge['type']}]--> {tgt_name}")
|
||||
evidence_chain = " | ".join(chain_parts) if chain_parts else "No edges in subgraph."
|
||||
|
||||
return {
|
||||
"query": query,
|
||||
"subgraph": subgraph,
|
||||
"supporting_documents": supporting_documents,
|
||||
"evidence_chain": evidence_chain,
|
||||
}
|
||||
@@ -0,0 +1,198 @@
|
||||
"""
|
||||
template_agent_workflow.py — Template agent script for ingesting + querying the context graph.
|
||||
|
||||
This script demonstrates the complete workflow an agent should follow:
|
||||
1. Read markdown guidance files
|
||||
2. Extract entities/relations via LLM reasoning
|
||||
3. Call Python methods to persist
|
||||
4. Query the graph
|
||||
5. Handle errors gracefully
|
||||
|
||||
Copy and adapt this template for your agent implementation.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add tools to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from contextgraph import ContextGraphSkill
|
||||
|
||||
|
||||
def ingest_document(skill: ContextGraphSkill, document: str) -> dict:
|
||||
"""
|
||||
Step 1: Agent reads ingestion.md and ontology.md
|
||||
Step 2: Agent uses LLM to extract entities and relations
|
||||
Step 3: Call Python methods to persist (mimicked here with static extraction)
|
||||
|
||||
In a real agent, replace the static extraction with LLM calls.
|
||||
"""
|
||||
print(f"\n[INGEST] Processing document:\n{document}\n")
|
||||
|
||||
# --- STEP 1 & 2: LLM EXTRACTION PHASE (Guided by ingestion.md + ontology.md) ---
|
||||
# In a real agent, this would use LLM reasoning.
|
||||
# For now, we'll mock an extraction result:
|
||||
|
||||
extraction_result = {
|
||||
"entities": [
|
||||
{"name": "memory leak", "type": "issue"},
|
||||
{"name": "system crash", "type": "issue"},
|
||||
{"name": "object", "type": "component"},
|
||||
],
|
||||
"relations": [
|
||||
{
|
||||
"source": "memory leak",
|
||||
"target": "system crash",
|
||||
"type": "causes",
|
||||
"confidence": 1.0,
|
||||
},
|
||||
{
|
||||
"source": "object",
|
||||
"target": "memory leak",
|
||||
"type": "contributes to",
|
||||
"confidence": 0.9,
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
print(f"[LLM] Extracted entities + relations:")
|
||||
print(json.dumps(extraction_result, indent=2))
|
||||
|
||||
# --- STEP 3: PERSIST PHASE (Call Python methods) ---
|
||||
errors = []
|
||||
added_nodes = {}
|
||||
|
||||
for entity in extraction_result["entities"]:
|
||||
try:
|
||||
node_id = skill.add_node(entity["name"], entity["type"])
|
||||
added_nodes[entity["name"]] = node_id
|
||||
print(f" ✓ Added node: {entity['name']} (id: {node_id}, type: {entity['type']})")
|
||||
except Exception as e:
|
||||
errors.append(f"Failed to add node {entity['name']}: {e}")
|
||||
print(f" ✗ Error adding node {entity['name']}: {e}")
|
||||
|
||||
for relation in extraction_result["relations"]:
|
||||
# Validate both endpoints exist
|
||||
if relation["source"] not in added_nodes or relation["target"] not in added_nodes:
|
||||
error_msg = f"Cannot add edge: source or target missing"
|
||||
errors.append(error_msg)
|
||||
print(f" ✗ Skip edge {relation['source']} → {relation['target']}: {error_msg}")
|
||||
continue
|
||||
|
||||
# Validate confidence threshold
|
||||
if relation["confidence"] < 0.6:
|
||||
error_msg = f"Confidence {relation['confidence']} < 0.6 (minimum threshold)"
|
||||
errors.append(error_msg)
|
||||
print(f" ✗ Skip edge {relation['source']} → {relation['target']}: {error_msg}")
|
||||
continue
|
||||
|
||||
try:
|
||||
skill.add_edge(
|
||||
source_name=relation["source"],
|
||||
target_name=relation["target"],
|
||||
relation=relation["type"],
|
||||
confidence=relation["confidence"],
|
||||
)
|
||||
print(
|
||||
f" ✓ Added edge: {relation['source']} "
|
||||
f"--[{relation['type']}]→ {relation['target']} "
|
||||
f"(confidence: {relation['confidence']})"
|
||||
)
|
||||
except Exception as e:
|
||||
errors.append(f"Failed to add edge {relation['source']} → {relation['target']}: {e}")
|
||||
print(f" ✗ Error adding edge: {e}")
|
||||
|
||||
return {
|
||||
"success": len(errors) == 0,
|
||||
"nodes_added": len(added_nodes),
|
||||
"edges_added": len(extraction_result["relations"]) - len(
|
||||
[e for e in errors if "skip edge" in e.lower()]
|
||||
),
|
||||
"errors": errors,
|
||||
}
|
||||
|
||||
|
||||
def query_graph(skill: ContextGraphSkill, query: str) -> dict:
|
||||
"""
|
||||
Query the graph for context to answer the user's question.
|
||||
|
||||
Step 1: Read retrieval.md
|
||||
Step 2: Call skill.query() which internally handles BFS + subgraph extraction
|
||||
Step 3: Return structured context
|
||||
"""
|
||||
print(f"\n[QUERY] {query}\n")
|
||||
|
||||
try:
|
||||
subgraph = skill.query(query)
|
||||
|
||||
if not subgraph["nodes"]:
|
||||
print(" ℹ No relevant entities found in graph.")
|
||||
return {
|
||||
"success": True,
|
||||
"query": query,
|
||||
"subgraph": subgraph,
|
||||
"nodes_found": 0,
|
||||
"edges_found": 0,
|
||||
}
|
||||
|
||||
print(f" ✓ Retrieved subgraph with {len(subgraph['nodes'])} nodes, {len(subgraph['edges'])} edges")
|
||||
print(f"\n Nodes:")
|
||||
for node_id, node in subgraph["nodes"].items():
|
||||
print(f" - {node['name']} (type: {node['type']}, id: {node_id})")
|
||||
|
||||
print(f"\n Edges:")
|
||||
for edge in subgraph["edges"]:
|
||||
source_name = subgraph["nodes"][edge["source"]]["name"]
|
||||
target_name = subgraph["nodes"][edge["target"]]["name"]
|
||||
print(
|
||||
f" - {source_name} --[{edge['type']}]→ {target_name} "
|
||||
f"(confidence: {edge['confidence']})"
|
||||
)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"query": query,
|
||||
"subgraph": subgraph,
|
||||
"nodes_found": len(subgraph["nodes"]),
|
||||
"edges_found": len(subgraph["edges"]),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Query failed: {e}"
|
||||
print(f" ✗ {error_msg}")
|
||||
return {"success": False, "query": query, "error": error_msg}
|
||||
|
||||
|
||||
def main():
|
||||
"""Demo: ingest a document, then query the graph."""
|
||||
skill = ContextGraphSkill()
|
||||
|
||||
# ===== INGESTION =====
|
||||
document = """
|
||||
System crashes due to memory leaks.
|
||||
Memory leaks occur when objects are not released.
|
||||
"""
|
||||
|
||||
result = ingest_document(skill, document)
|
||||
print(f"\n[INGEST RESULT] Nodes added: {result['nodes_added']}, " f"Edges added: {result['edges_added']}")
|
||||
if result["errors"]:
|
||||
print(f"Errors: {result['errors']}")
|
||||
|
||||
# ===== RETRIEVAL =====
|
||||
queries = [
|
||||
"Why does the system crash?",
|
||||
"What causes memory leaks?",
|
||||
]
|
||||
|
||||
for query in queries:
|
||||
result = query_graph(skill, query)
|
||||
if result["success"]:
|
||||
print(f" Nodes found: {result['nodes_found']}, Edges found: {result['edges_found']}")
|
||||
else:
|
||||
print(f" Error: {result['error']}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,191 @@
|
||||
"""
|
||||
documents_store.py — Persistent storage for raw documents and chunks (RAG layer).
|
||||
|
||||
Inspired by Karpathy's LLM Wiki pattern: raw sources are immutable and stored
|
||||
as the ground truth. Chunks are the retrieval unit; provenance links tie graph
|
||||
nodes/edges back to specific chunks.
|
||||
|
||||
Handles:
|
||||
- Storing raw documents with metadata
|
||||
- Chunking documents into overlapping text windows
|
||||
- Retrieving chunks by id or by keyword search
|
||||
- Persisting to data/documents.json
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
import config
|
||||
|
||||
_DATA_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_DATA_DIR", str(config.DATA_DIR)))
|
||||
_DOCS_FILE = _DATA_DIR / "documents.json"
|
||||
|
||||
_CHUNK_SIZE = 500 # characters per chunk
|
||||
_CHUNK_OVERLAP = 100 # overlap between consecutive chunks
|
||||
|
||||
_STOPWORDS = frozenset([
|
||||
"a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
|
||||
"have", "has", "had", "do", "does", "did", "will", "would", "could",
|
||||
"should", "may", "might", "shall", "can", "to", "of", "in", "on",
|
||||
"at", "by", "for", "with", "from", "and", "or", "but", "not", "it",
|
||||
"its", "this", "that", "these", "those", "i", "you", "he", "she",
|
||||
"we", "they", "what", "which", "who", "how", "why", "when", "where",
|
||||
])
|
||||
|
||||
|
||||
def _load() -> dict:
|
||||
if _DOCS_FILE.exists():
|
||||
with open(_DOCS_FILE, "r") as f:
|
||||
return json.load(f)
|
||||
return {"documents": {}}
|
||||
|
||||
|
||||
def _save(store: dict) -> None:
|
||||
_DATA_DIR.mkdir(parents=True, exist_ok=True)
|
||||
with open(_DOCS_FILE, "w") as f:
|
||||
json.dump(store, f, indent=2)
|
||||
|
||||
|
||||
def _tokenize(text: str) -> list[str]:
|
||||
tokens = re.findall(r"[a-z0-9]+", text.lower())
|
||||
return [t for t in tokens if t not in _STOPWORDS and len(t) > 1]
|
||||
|
||||
|
||||
def _chunk_text(content: str, chunk_size: int = _CHUNK_SIZE, overlap: int = _CHUNK_OVERLAP) -> list[str]:
|
||||
"""Split content into overlapping character windows."""
|
||||
chunks = []
|
||||
start = 0
|
||||
while start < len(content):
|
||||
end = start + chunk_size
|
||||
chunks.append(content[start:end].strip())
|
||||
if end >= len(content):
|
||||
break
|
||||
start += chunk_size - overlap
|
||||
return [c for c in chunks if c]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def add_document(
|
||||
doc_id: str,
|
||||
title: str,
|
||||
source: str,
|
||||
content: str,
|
||||
) -> dict:
|
||||
"""
|
||||
Store a raw document and auto-generate chunks.
|
||||
|
||||
Args:
|
||||
doc_id: Caller-supplied stable identifier (e.g. "doc_001" or a filename).
|
||||
title: Human-readable title.
|
||||
source: Origin path/URL (immutable provenance pointer).
|
||||
content: Full raw text to store and chunk.
|
||||
|
||||
Returns:
|
||||
The stored document dict including generated chunk_ids.
|
||||
"""
|
||||
store = _load()
|
||||
|
||||
# Idempotent: return existing doc if already stored
|
||||
if doc_id in store["documents"]:
|
||||
return store["documents"][doc_id]
|
||||
|
||||
raw_chunks = _chunk_text(content)
|
||||
chunks = []
|
||||
for i, text in enumerate(raw_chunks):
|
||||
chunks.append({
|
||||
"chunk_id": f"{doc_id}_chunk_{i:03d}",
|
||||
"index": i,
|
||||
"text": text,
|
||||
})
|
||||
|
||||
doc = {
|
||||
"id": doc_id,
|
||||
"title": title,
|
||||
"source": source,
|
||||
"content": content,
|
||||
"chunks": chunks,
|
||||
"ingestion_date": datetime.now(timezone.utc).isoformat(),
|
||||
}
|
||||
store["documents"][doc_id] = doc
|
||||
_save(store)
|
||||
return doc
|
||||
|
||||
|
||||
def get_document(doc_id: str) -> dict | None:
|
||||
"""Return the full document record or None if not found."""
|
||||
store = _load()
|
||||
return store["documents"].get(doc_id)
|
||||
|
||||
|
||||
def get_chunk(chunk_id: str) -> dict | None:
|
||||
"""Return a specific chunk by its chunk_id (searches across all documents)."""
|
||||
store = _load()
|
||||
for doc in store["documents"].values():
|
||||
for chunk in doc["chunks"]:
|
||||
if chunk["chunk_id"] == chunk_id:
|
||||
return chunk
|
||||
return None
|
||||
|
||||
|
||||
def get_chunks_for_document(doc_id: str) -> list[dict]:
|
||||
"""Return all chunks for a document."""
|
||||
doc = get_document(doc_id)
|
||||
if doc is None:
|
||||
return []
|
||||
return doc["chunks"]
|
||||
|
||||
|
||||
def search_chunks(query: str, top_k: int = 5) -> list[dict]:
|
||||
"""
|
||||
Keyword search over chunk text. Returns top_k matching chunks sorted by
|
||||
term overlap (simple TF-style scoring, no embeddings required).
|
||||
|
||||
Returns list of dicts with keys: chunk_id, doc_id, score, text.
|
||||
"""
|
||||
store = _load()
|
||||
query_tokens = set(_tokenize(query))
|
||||
if not query_tokens:
|
||||
return []
|
||||
|
||||
scored: list[tuple[float, dict]] = []
|
||||
for doc in store["documents"].values():
|
||||
for chunk in doc["chunks"]:
|
||||
chunk_tokens = set(_tokenize(chunk["text"]))
|
||||
overlap = len(query_tokens & chunk_tokens)
|
||||
if overlap > 0:
|
||||
score = overlap / len(query_tokens)
|
||||
scored.append((score, {
|
||||
"chunk_id": chunk["chunk_id"],
|
||||
"doc_id": doc["id"],
|
||||
"doc_title": doc["title"],
|
||||
"score": round(score, 4),
|
||||
"text": chunk["text"],
|
||||
}))
|
||||
|
||||
scored.sort(key=lambda x: x[0], reverse=True)
|
||||
return [item for _, item in scored[:top_k]]
|
||||
|
||||
|
||||
def list_documents() -> list[dict]:
|
||||
"""Return a summary list of all stored documents (no content, no chunks)."""
|
||||
store = _load()
|
||||
return [
|
||||
{
|
||||
"id": doc["id"],
|
||||
"title": doc["title"],
|
||||
"source": doc["source"],
|
||||
"chunk_count": len(doc["chunks"]),
|
||||
"ingestion_date": doc["ingestion_date"],
|
||||
}
|
||||
for doc in store["documents"].values()
|
||||
]
|
||||
@@ -0,0 +1,202 @@
"""
graph_store.py — Persistent storage for graph nodes and edges.

Handles:
- Adding/deduplicating nodes
- Adding edges with confidence
- Fetching neighbors
- Persisting to graph.json
"""
from __future__ import annotations

import json
import os
import sys
import uuid
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))
import config

_DATA_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_DATA_DIR", str(config.DATA_DIR)))
_GRAPH_FILE = _DATA_DIR / "graph.json"


def _load() -> dict:
    if _GRAPH_FILE.exists():
        with open(_GRAPH_FILE, "r") as f:
            return json.load(f)
    return {"nodes": {}, "edges": []}


def _save(graph: dict) -> None:
    _DATA_DIR.mkdir(parents=True, exist_ok=True)
    with open(_GRAPH_FILE, "w") as f:
        json.dump(graph, f, indent=2)


def add_node(
    name: str,
    node_type: str,
    source_document: str | None = None,
    source_chunks: list[str] | None = None,
) -> str:
    """
    Add a node if it doesn't exist. Returns node_id.

    Args:
        source_document: doc_id from documents_store (provenance pointer).
        source_chunks: list of chunk_ids that mention this entity.
    """
    graph = _load()
    name_lower = name.strip().lower()

    # Deduplication: search by normalized name
    for node_id, node in graph["nodes"].items():
        if node["name"] == name_lower:
            # Merge provenance if new info provided
            changed = False
            if source_document and node.get("source_document") is None:
                node["source_document"] = source_document
                changed = True
            if source_chunks:
                existing = set(node.get("source_chunks") or [])
                merged = list(existing | set(source_chunks))
                if merged != list(existing):
                    node["source_chunks"] = merged
                    changed = True
            if changed:
                _save(graph)
            return node_id

    node_id = str(uuid.uuid4())[:8]
    graph["nodes"][node_id] = {
        "name": name_lower,
        "type": node_type.strip().lower(),
        "source_document": source_document,
        "source_chunks": source_chunks or [],
    }
    _save(graph)
    return node_id


def add_edge(
    source_id: str,
    target_id: str,
    relation: str,
    confidence: float,
    source_document: str | None = None,
    supporting_text: str | None = None,
    chunk_id: str | None = None,
) -> None:
    """
    Add a directed edge between two nodes.

    Args:
        source_document: doc_id from documents_store (provenance pointer).
        supporting_text: The exact text span that supports this relation.
        chunk_id: The specific chunk_id the supporting text came from.
    """
    graph = _load()

    # Deduplicate edges by source + target + relation
    relation_lower = relation.strip().lower()
    for edge in graph["edges"]:
        if (
            edge["source"] == source_id
            and edge["target"] == target_id
            and edge["type"] == relation_lower
        ):
            changed = False
            if confidence > edge["confidence"]:
                edge["confidence"] = confidence
                changed = True
            if source_document and edge.get("source_document") is None:
                edge["source_document"] = source_document
                changed = True
            if supporting_text and edge.get("supporting_text") is None:
                edge["supporting_text"] = supporting_text
                changed = True
            if chunk_id and edge.get("chunk_id") is None:
                edge["chunk_id"] = chunk_id
                changed = True
            if changed:
                _save(graph)
            return

    graph["edges"].append({
        "source": source_id,
        "target": target_id,
        "type": relation_lower,
        "confidence": confidence,
        "source_document": source_document,
        "supporting_text": supporting_text,
        "chunk_id": chunk_id,
    })
    _save(graph)


def get_neighbors(node_id: str, min_confidence: float = 0.0) -> list[str]:
    """Return node_ids of all nodes directly connected to node_id (either direction)."""
    graph = _load()
    neighbors = []
    for edge in graph["edges"]:
        if edge["confidence"] < min_confidence:
            continue
        if edge["source"] == node_id:
            neighbors.append(edge["target"])
        elif edge["target"] == node_id:
            neighbors.append(edge["source"])
    return list(set(neighbors))


def get_node(node_id: str) -> dict | None:
    """Fetch a single node by ID."""
    graph = _load()
    return graph["nodes"].get(node_id)


def get_subgraph(node_ids: list[str]) -> dict:
    """Return nodes and edges induced by the given node_ids."""
    graph = _load()
    node_id_set = set(node_ids)

    nodes = {nid: graph["nodes"][nid] for nid in node_ids if nid in graph["nodes"]}
    edges = [
        e
        for e in graph["edges"]
        if e["source"] in node_id_set and e["target"] in node_id_set
    ]
    return {"nodes": nodes, "edges": edges}


def find_node_by_name(name: str) -> str | None:
    """Return node_id for a given normalized name, or None."""
    graph = _load()
    name_lower = name.strip().lower()
    for node_id, node in graph["nodes"].items():
        if node["name"] == name_lower:
            return node_id
    return None


def link_node_to_source(node_id: str, doc_id: str, chunk_ids: list[str]) -> None:
    """Attach provenance (doc_id + chunk_ids) to an existing node."""
    graph = _load()
    if node_id not in graph["nodes"]:
        return
    node = graph["nodes"][node_id]
    node["source_document"] = doc_id
    existing = set(node.get("source_chunks") or [])
    node["source_chunks"] = list(existing | set(chunk_ids))
    _save(graph)


def get_node_sources(node_id: str) -> dict:
    """Return provenance info (source_document + source_chunks) for a node."""
    graph = _load()
    node = graph["nodes"].get(node_id, {})
    return {
        "source_document": node.get("source_document"),
        "source_chunks": node.get("source_chunks", []),
    }
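A minimal usage sketch for the node/edge API above. The entity names, relation, and doc_id are hypothetical, and the import path is assumed from the Quick Start convention.

```python
# Illustrative sketch, not part of graph_store.py.
from scripts.tools import graph_store  # assumed import path

leak = graph_store.add_node("Memory Leak", "issue", source_document="doc_001")
crash = graph_store.add_node("System Crash", "issue", source_document="doc_001")

graph_store.add_edge(
    leak, crash, "causes", confidence=1.0,
    source_document="doc_001",
    supporting_text="System crashes due to memory leaks.",
)

# Re-adding the same (normalized) name merges provenance instead of duplicating the node.
assert graph_store.add_node("memory leak", "issue") == leak
assert crash in graph_store.get_neighbors(leak, min_confidence=0.5)
```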
@@ -0,0 +1,90 @@
"""
index_store.py — Maintains entity and keyword indexes for fast lookup.

Handles:
- Entity index: name → [node_ids]
- Keyword index: token → [node_ids]
- Persist to index.json
"""
from __future__ import annotations

import json
import os
import re
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))
import config

_DATA_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_DATA_DIR", str(config.DATA_DIR)))
_INDEX_FILE = _DATA_DIR / "index.json"

_STOPWORDS = frozenset(
    [
        "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
        "have", "has", "had", "do", "does", "did", "will", "would", "could",
        "should", "may", "might", "shall", "can", "to", "of", "in", "on",
        "at", "by", "for", "with", "from", "and", "or", "but", "not", "it",
        "its", "this", "that", "these", "those", "i", "you", "he", "she",
        "we", "they", "what", "which", "who", "how", "why", "when", "where",
    ]
)


def _load() -> dict:
    if _INDEX_FILE.exists():
        with open(_INDEX_FILE, "r") as f:
            return json.load(f)
    return {"entity_index": {}, "keyword_index": {}}


def _save(index: dict) -> None:
    _DATA_DIR.mkdir(parents=True, exist_ok=True)
    with open(_INDEX_FILE, "w") as f:
        json.dump(index, f, indent=2)


def _tokenize(text: str) -> list[str]:
    """Split text into lowercase tokens, removing stopwords and short tokens."""
    tokens = re.findall(r"[a-z0-9]+", text.lower())
    return [t for t in tokens if t not in _STOPWORDS and len(t) > 1]


def add_entity(name: str, node_id: str) -> None:
    """Register an entity name → node_id in both entity and keyword indexes."""
    index = _load()
    name_lower = name.strip().lower()

    # Entity index
    if name_lower not in index["entity_index"]:
        index["entity_index"][name_lower] = []
    if node_id not in index["entity_index"][name_lower]:
        index["entity_index"][name_lower].append(node_id)

    # Keyword index
    for token in _tokenize(name_lower):
        if token not in index["keyword_index"]:
            index["keyword_index"][token] = []
        if node_id not in index["keyword_index"][token]:
            index["keyword_index"][token].append(node_id)

    _save(index)


def search(query: str) -> list[str]:
    """Search for node_ids matching the query via entity name or keywords."""
    index = _load()
    query_lower = query.strip().lower()
    matched_ids: set[str] = set()

    # Exact entity name match
    if query_lower in index["entity_index"]:
        matched_ids.update(index["entity_index"][query_lower])

    # Keyword match
    for token in _tokenize(query_lower):
        if token in index["keyword_index"]:
            matched_ids.update(index["keyword_index"][token])

    return list(matched_ids)
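A small sketch of how the indexes above are populated and queried. The node ids and names are made up; the import path is assumed.

```python
# Illustrative sketch, not part of index_store.py.
from scripts.tools import index_store  # assumed import path

index_store.add_entity("memory leak", "a1b2c3d4")
index_store.add_entity("system crash", "e5f6a7b8")

# Exact entity match and keyword match both resolve to node_ids;
# stopwords ("why", "does", "the") are dropped before matching.
print(index_store.search("memory leak"))                 # ["a1b2c3d4"]
print(index_store.search("why does the system crash?"))  # ["e5f6a7b8"]
```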
@@ -0,0 +1,175 @@
"""
ontology_store.py — Tracks entity types and relation types.

Handles:
- Registering types and relations with usage counts
- Normalizing types and relations via synonym mapping
- Persisting to ontology.json

NOTE: No LLM logic here. Normalization is rule-based (lowercase + synonym map).
"""

import json
import os
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))
import config

_DATA_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_DATA_DIR", str(config.DATA_DIR)))
_ONTOLOGY_FILE = _DATA_DIR / "ontology.json"

# Synonym maps — lowercase variants map to canonical forms
_ENTITY_TYPE_MAP: dict[str, str] = {
    "component": "component",
    "module": "component",
    "class": "component",
    "function": "component",
    "method": "component",
    "bug": "issue",
    "defect": "issue",
    "fault": "issue",
    "error": "issue",
    "failure": "issue",
    "problem": "issue",
    "crash": "issue",
    "server": "infrastructure",
    "host": "infrastructure",
    "machine": "infrastructure",
    "node": "infrastructure",
    "user": "actor",
    "person": "actor",
    "operator": "actor",
    "admin": "actor",
    "administrator": "actor",
    "actor": "actor",
    "app": "software",
    "application": "software",
    "service": "software",
    "program": "software",
    "software": "software",
    "database": "storage",
    "datastore": "storage",
    "db": "storage",
    "storage": "storage",
    "api": "interface",
    "endpoint": "interface",
    "interface": "interface",
    "connection": "interface",
    "event": "event",
    "incident": "event",
    "occurrence": "event",
    "trigger": "event",
    "concept": "concept",
    "idea": "concept",
    "principle": "concept",
    "theory": "concept",
    "process": "process",
    "thread": "process",
    "task": "process",
    "job": "process",
    "workflow": "process",
    "object": "component",
    "resource": "component",
    "memory": "resource",
    "cpu": "resource",
    "system": "system",
    "platform": "system",
    "framework": "system",
    "library": "software",
    "package": "software",
}

_RELATION_TYPE_MAP: dict[str, str] = {
    "causes": "causes",
    "triggers": "causes",
    "leads to": "causes",
    "results in": "causes",
    "produces": "causes",
    "is part of": "contains",
    "belongs to": "contains",
    "lives in": "contains",
    "sits in": "contains",
    "contains": "contains",
    "depends on": "depends on",
    "requires": "depends on",
    "needs": "depends on",
    "uses": "uses",
    "calls": "uses",
    "invokes": "uses",
    "consumes": "uses",
    "affects": "affects",
    "impacts": "affects",
    "influences": "affects",
    "creates": "creates",
    "instantiates": "creates",
    "spawns": "creates",
    "connects to": "connects to",
    "links to": "connects to",
    "references": "connects to",
    "inherits from": "extends",
    "extends": "extends",
    "subclasses": "extends",
    "reads from": "reads from",
    "queries": "reads from",
    "fetches": "reads from",
    "writes to": "writes to",
    "stores in": "writes to",
    "persists to": "writes to",
    "contributes to": "contributes to",
    "allocated by": "allocated by",
    "released by": "released by",
    "not released": "not released",
}


def _load() -> dict:
    if _ONTOLOGY_FILE.exists():
        with open(_ONTOLOGY_FILE, "r") as f:
            return json.load(f)
    return {"entity_types": {}, "relation_types": {}}


def _save(ontology: dict) -> None:
    _DATA_DIR.mkdir(parents=True, exist_ok=True)
    with open(_ONTOLOGY_FILE, "w") as f:
        json.dump(ontology, f, indent=2)


def normalize_type(type_name: str) -> str:
    """Return the canonical form of an entity type."""
    key = type_name.strip().lower().replace("-", " ").replace("_", " ")
    return _ENTITY_TYPE_MAP.get(key, key)


def normalize_relation(relation_name: str) -> str:
    """Return the canonical form of a relation type."""
    key = relation_name.strip().lower().replace("-", " ").replace("_", " ")
    return _RELATION_TYPE_MAP.get(key, key)


def add_type(type_name: str) -> None:
    """Register an entity type, incrementing its usage count."""
    ontology = _load()
    canonical = normalize_type(type_name)
    ontology["entity_types"][canonical] = ontology["entity_types"].get(canonical, 0) + 1
    _save(ontology)


def add_relation(relation_name: str) -> None:
    """Register a relation type, incrementing its usage count."""
    ontology = _load()
    canonical = normalize_relation(relation_name)
    ontology["relation_types"][canonical] = ontology["relation_types"].get(canonical, 0) + 1
    _save(ontology)


def get_all_types() -> dict[str, int]:
    """Return all registered entity types with counts."""
    return _load()["entity_types"]


def get_all_relations() -> dict[str, int]:
    """Return all registered relation types with counts."""
    return _load()["relation_types"]
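For clarity, a few normalization calls against the synonym maps above; the commented outputs follow directly from the maps, and the import path is assumed.

```python
# Illustrative sketch, not part of ontology_store.py.
from scripts.tools import ontology_store  # assumed import path

print(ontology_store.normalize_type("Bug"))            # "issue"
print(ontology_store.normalize_type("micro-service"))  # "micro service" (unknown, passes through)
print(ontology_store.normalize_relation("leads_to"))   # "causes"

ontology_store.add_type("defect")        # counted under canonical "issue"
ontology_store.add_relation("triggers")  # counted under canonical "causes"
print(ontology_store.get_all_types())    # e.g. {"issue": 1, ...}
```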
@@ -0,0 +1,58 @@
"""
retrieval_engine.py — BFS-based graph traversal for context retrieval.

Input: seed node_ids + depth
Output: list of node_ids within traversal depth filtered by min_confidence
"""
from __future__ import annotations

import sys
from pathlib import Path
from collections import deque

# Allow imports from parent package
sys.path.insert(0, str(Path(__file__).parent.parent))

from tools import graph_store
import config


def retrieve(
    seed_node_ids: list[str],
    depth: int = config.MAX_GRAPH_DEPTH,
    min_confidence: float = config.MIN_CONFIDENCE,
    max_nodes: int = config.MAX_NODES,
) -> list[str]:
    """
    BFS from seed nodes up to `depth` hops.

    Returns a list of node_ids (including seeds) within the traversal,
    filtered by min_confidence on edges and capped at max_nodes.
    """
    visited: set[str] = set()
    # Queue items: (node_id, current_depth)
    queue: deque[tuple[str, int]] = deque()

    for seed in seed_node_ids:
        if seed not in visited:
            visited.add(seed)
            queue.append((seed, 0))

    while queue:
        if len(visited) >= max_nodes:
            break

        node_id, current_depth = queue.popleft()

        if current_depth >= depth:
            continue

        neighbors = graph_store.get_neighbors(node_id, min_confidence=min_confidence)
        for neighbor in neighbors:
            if neighbor not in visited:
                visited.add(neighbor)
                queue.append((neighbor, current_depth + 1))
                if len(visited) >= max_nodes:
                    break

    return list(visited)
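A sketch of the intended retrieval flow: seed the BFS with node_ids from the index, expand, then materialize the induced subgraph. Import paths and parameter values are assumptions, not confirmed by the source.

```python
# Illustrative sketch, not part of retrieval_engine.py.
from scripts.tools import graph_store, index_store  # assumed import paths
from scripts import retrieval_engine                 # assumed location

seeds = index_store.search("memory leak")            # seed node_ids from the keyword/entity index
node_ids = retrieval_engine.retrieve(seeds, depth=2, min_confidence=0.5)
subgraph = graph_store.get_subgraph(node_ids)        # nodes plus the edges among them
print(len(subgraph["nodes"]), len(subgraph["edges"]))
```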
@@ -0,0 +1,294 @@
"""
wiki_store.py — Manages the persistent wiki layer.

Inspired by Karpathy's LLM Wiki pattern: the wiki is a directory of LLM-generated
markdown pages that the agent writes and maintains. This module provides the
deterministic file I/O and index/log management so the agent can focus on
reasoning, not bookkeeping.

Wiki structure (relative to project root):
    wiki/
        index.md     ← content-oriented catalog of all pages
        log.md       ← chronological append-only operation log
        entities/    ← one page per entity (person, concept, system, etc.)
        summaries/   ← source document summary pages
        topics/      ← cross-cutting synthesis and topic pages

The agent WRITES pages; this module handles the filesystem + index + log.
"""
from __future__ import annotations

import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))
import config

_WIKI_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_WIKI_DIR", str(config.WIKI_DIR)))
_INDEX_FILE = _WIKI_DIR / "index.md"
_LOG_FILE = _WIKI_DIR / "log.md"

_CATEGORY_DIRS = {
    "entity": _WIKI_DIR / "entities",
    "summary": _WIKI_DIR / "summaries",
    "topic": _WIKI_DIR / "topics",
}

# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------

def _ensure_dirs() -> None:
    _WIKI_DIR.mkdir(parents=True, exist_ok=True)
    for d in _CATEGORY_DIRS.values():
        d.mkdir(parents=True, exist_ok=True)


def _now_iso() -> str:
    return datetime.now(timezone.utc).strftime("%Y-%m-%d")


def _slug(title: str) -> str:
    """Convert a title to a filesystem-safe slug."""
    slug = title.lower().strip()
    slug = re.sub(r"[^a-z0-9]+", "-", slug)
    return slug.strip("-")


def _page_path(category: str, slug: str) -> Path:
    base = _CATEGORY_DIRS.get(category, _WIKI_DIR)
    return base / f"{slug}.md"


# ---------------------------------------------------------------------------
# Index management
# ---------------------------------------------------------------------------

def _load_index() -> list[dict]:
    """Parse index.md into a list of entry dicts."""
    if not _INDEX_FILE.exists():
        return []
    entries = []
    for line in _INDEX_FILE.read_text().splitlines():
        # Expected table row: | [[slug]] | category | summary | date |
        if line.startswith("| [["):
            parts = [p.strip() for p in line.split("|") if p.strip()]
            if len(parts) >= 3:
                link = parts[0]  # [[slug]]
                category = parts[1] if len(parts) > 1 else ""
                summary = parts[2] if len(parts) > 2 else ""
                date = parts[3] if len(parts) > 3 else ""
                slug = re.sub(r"\[\[|\]\]", "", link)
                entries.append({
                    "slug": slug,
                    "category": category,
                    "summary": summary,
                    "date": date,
                })
    return entries


def _save_index(entries: list[dict]) -> None:
    """Rewrite index.md from the entries list."""
    _ensure_dirs()
    lines = [
        "# Wiki Index\n",
        "_Auto-managed by wiki_store. Do not edit the table manually._\n\n",
        "| Page | Category | Summary | Date |\n",
        "|------|----------|---------|------|\n",
    ]
    for e in entries:
        lines.append(
            f"| [[{e['slug']}]] | {e['category']} | {e['summary']} | {e['date']} |\n"
        )
    _INDEX_FILE.write_text("".join(lines))


def _append_log(operation: str, detail: str) -> None:
    """Append a timestamped entry to log.md."""
    _ensure_dirs()
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    entry = f"\n## [{timestamp}] {operation} | {detail}\n"
    with open(_LOG_FILE, "a") as f:
        f.write(entry)


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def write_page(
    category: str,
    title: str,
    content: str,
    summary: str = "",
) -> str:
    """
    Write (or overwrite) a wiki page.

    The agent provides the full markdown content. This function then:
    - Writes the .md file to the appropriate category subfolder.
    - Updates index.md with a one-line entry.
    - Appends an entry to log.md.

    Args:
        category: One of "entity", "summary", "topic".
        title: Human-readable page title (used for slug + index).
        content: Full markdown content the agent wrote.
        summary: One-line summary for the index (optional; auto-extracted if empty).

    Returns:
        Relative path from wiki root (e.g. "entities/memory-leak.md").
    """
    _ensure_dirs()
    slug = _slug(title)
    path = _page_path(category, slug)

    # Auto-extract first non-heading, non-empty line as summary if not provided
    if not summary:
        for line in content.splitlines():
            stripped = line.strip()
            if stripped and not stripped.startswith("#"):
                summary = stripped[:100]
                break

    path.write_text(content)

    # Update index
    entries = _load_index()
    existing = next((e for e in entries if e["slug"] == slug), None)
    if existing:
        existing["summary"] = summary
        existing["date"] = _now_iso()
    else:
        entries.append({
            "slug": slug,
            "category": category,
            "summary": summary,
            "date": _now_iso(),
        })
    _save_index(entries)
    _append_log("write", title)

    return str(path.relative_to(_WIKI_DIR))


def read_page(category: str, title: str) -> str | None:
    """Read a wiki page's content. Returns None if not found."""
    slug = _slug(title)
    path = _page_path(category, slug)
    if not path.exists():
        return None
    return path.read_text()


def read_page_by_slug(slug: str) -> str | None:
    """Read a wiki page by slug, searching across all categories."""
    for d in list(_CATEGORY_DIRS.values()) + [_WIKI_DIR]:
        path = d / f"{slug}.md"
        if path.exists():
            return path.read_text()
    return None


def search_wiki(query: str) -> list[dict]:
    """
    Simple keyword search over all wiki pages.
    Returns list of {slug, category, path, score, snippet} sorted by relevance.
    """
    query_tokens = set(re.findall(r"[a-z0-9]+", query.lower()))
    if not query_tokens:
        return []

    results = []
    for category, base_dir in _CATEGORY_DIRS.items():
        if not base_dir.exists():
            continue
        for page_path in base_dir.glob("*.md"):
            content = page_path.read_text().lower()
            content_tokens = set(re.findall(r"[a-z0-9]+", content))
            overlap = len(query_tokens & content_tokens)
            if overlap > 0:
                # Extract a short snippet around first match
                first_token = next(iter(query_tokens & content_tokens), "")
                idx = content.find(first_token)
                snippet = content[max(0, idx - 30):idx + 80].replace("\n", " ").strip()
                results.append({
                    "slug": page_path.stem,
                    "category": category,
                    "path": str(page_path.relative_to(_WIKI_DIR)),
                    "score": overlap,
                    "snippet": snippet,
                })

    results.sort(key=lambda x: x["score"], reverse=True)
    return results


def list_pages(category: str | None = None) -> list[dict]:
    """List all wiki pages, optionally filtered by category."""
    entries = _load_index()
    if category:
        return [e for e in entries if e["category"] == category]
    return entries


def get_log(last_n: int = 20) -> list[str]:
    """Return the last N log entries from log.md."""
    if not _LOG_FILE.exists():
        return []
    lines = _LOG_FILE.read_text().splitlines()
    entries = [l for l in lines if l.startswith("## [")]
    return entries[-last_n:]


def lint_wiki() -> dict:
    """
    Health-check the wiki as described in Karpathy's LLM Wiki pattern.

    Checks for:
    - Orphan pages (in directory but not in index)
    - Missing pages (in index but file deleted)
    - Broken wikilinks ([[slug]] pointing to non-existent file)
    - Pages with no wikilinks (isolated pages)

    Returns:
        {
            "orphan_pages": [...],
            "missing_pages": [...],
            "broken_wikilinks": {slug: [broken_links]},
            "isolated_pages": [...],
        }
    """
    index_entries = {e["slug"] for e in _load_index()}
    file_slugs: dict[str, Path] = {}
    for d in _CATEGORY_DIRS.values():
        if d.exists():
            for p in d.glob("*.md"):
                file_slugs[p.stem] = p

    orphans = [s for s in file_slugs if s not in index_entries]
    missing = [s for s in index_entries if s not in file_slugs]

    broken_wikilinks: dict[str, list[str]] = {}
    isolated: list[str] = []
    all_slugs = set(file_slugs.keys())

    for slug, path in file_slugs.items():
        content = path.read_text()
        links = re.findall(r"\[\[([^\]]+)\]\]", content)
        if not links:
            isolated.append(slug)
        broken = [lnk for lnk in links if _slug(lnk) not in all_slugs]
        if broken:
            broken_wikilinks[slug] = broken

    return {
        "orphan_pages": orphans,
        "missing_pages": missing,
        "broken_wikilinks": broken_wikilinks,
        "isolated_pages": isolated,
    }
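Finally, a sketch of the wiki round-trip an agent performs with the module above. The page title, content, and linked slug are hypothetical; the import path follows the Quick Start convention.

```python
# Illustrative sketch, not part of wiki_store.py.
from scripts.tools import wiki_store

rel_path = wiki_store.write_page(
    category="entity",
    title="Memory Leak",
    content="# Memory Leak\n\nUnreleased objects exhaust memory. See [[system-crash-analysis-summary]].\n",
)
print(rel_path)                                      # "entities/memory-leak.md"
print(wiki_store.search_wiki("memory")[0]["slug"])   # "memory-leak" (highest term overlap)
print(wiki_store.lint_wiki()["broken_wikilinks"])    # flags wikilinks to pages that don't exist yet
```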