Files
awesome-copilot/skills/quality-playbook/quality_gate.py
T
Andrew Stellman b8441d218b Update quality-playbook skill to v1.5.6 + add agent (#1402)
Rebuilds branch from upstream/staged (was previously merged from
upstream/main, which brought in materialized plugin files that
fail Check Plugin Structure on PRs targeting staged).

Changes vs. staged:
- Update skills/quality-playbook/ to v1.5.6 (31 bundled assets:
  SKILL.md + LICENSE.txt + 16 references/ + 9 phase_prompts/ +
  3 agents/ + bin/citation_verifier.py + quality_gate.py).
- Add agents/quality-playbook.agent.md (top-level orchestrator).
  name: quality-playbook (validator-compliant).
- Update docs/README.skills.md quality-playbook row description
  + bundled-assets list to v1.5.6.
- Fix 'unparseable' → 'unparsable' in quality_gate.py (5 instances;
  codespell preference, both spellings valid).

Closes the v1.4.0 → v1.5.6 update in a single clean commit on top of
upstream/staged. The preserved backup branch backup-bedbe84-pre-rebuild
(SHA bedbe848fa3c0f0eda8e653c42b599a17dd2e354) holds the prior history for reference.
2026-05-11 11:31:53 +10:00

3386 lines
128 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""quality_gate.py — Post-run validation gate for Quality Playbook artifacts.
Mechanically checks artifact conformance issues that model self-attestation
persistently misses. Now the sole gate script; the earlier quality_gate.sh
(bash) has been retired. See quality_gate/test_quality_gate.py for the test
suite.
Usage:
./quality_gate.py . # Check current directory (benchmark mode)
./quality_gate.py --general . # Check with relaxed thresholds
./quality_gate.py virtio # Check named repo (from repos/)
./quality_gate.py --all # Check all current-version repos
./quality_gate.py --version 1.3.27 virtio # Check specific version
Exit codes:
0 — all checks passed
1 — one or more checks failed
Runs on Python 3.8+ with only the standard library.
"""
import json
import os
import re
import sys
from datetime import date
from pathlib import Path
SCRIPT_DIR = Path(__file__).resolve().parent
# Allow soft import of bin/citation_verifier for v1.5.1 byte-equality checks.
# The verifier may live at one of several locations depending on where the
# gate was installed:
# 1. <QPB-clone>/bin/citation_verifier.py — gate runs from the source tree
# (gate path: <clone>/.github/skills/quality_gate/quality_gate.py;
# bin/ is three parents up from SCRIPT_DIR).
# 2. <install-root>/bin/citation_verifier.py — gate installed alongside
# bin/ at the install root (v1.5.6 BUG-005 fix; bin/install_skill.py
# and repos/setup_repos.sh both bundle bin/citation_verifier.py here).
# 3. <install-root>/bin/citation_verifier.py via the nested-skills path
# (.github/skills/quality_gate.py — SCRIPT_DIR is .github/skills, and
# bin/ is two parents up).
# When none of these resolve, byte-equality is skipped with a WARN rather
# than a hard FAIL — the gate continues with reduced enforcement.
_CITATION_VERIFIER = None
_VERIFIER_SEARCH_ROOTS = [
SCRIPT_DIR.parent.parent.parent, # source-clone layout
SCRIPT_DIR, # gate + bin/ siblings (uncommon)
SCRIPT_DIR.parent.parent, # nested-skills layout (.github/skills/quality_gate.py)
]
for _candidate_root in _VERIFIER_SEARCH_ROOTS:
_verifier_file = _candidate_root / "bin" / "citation_verifier.py"
if _verifier_file.is_file():
try:
if str(_candidate_root) not in sys.path:
sys.path.insert(0, str(_candidate_root))
from bin import citation_verifier as _CITATION_VERIFIER # noqa: E402
break
except Exception: # noqa: BLE001 — missing / misinstalled bin/ is tolerable
_CITATION_VERIFIER = None
continue
# Global counters — reset per invocation via main(). Tests that call check_repo
# directly should reset these in setUp.
FAIL = 0
WARN = 0
# v1.5.2 — REQ Pattern field (Lever 2)
VALID_PATTERN_VALUES = frozenset({"whitelist", "parity", "compensation"})
_REQ_PATTERN_RE = re.compile(
r"^\s*-\s*Pattern:\s*(\S+)\s*$", re.IGNORECASE | re.MULTILINE
)
def extract_req_pattern(req_block):
"""Return the REQ's pattern tag from a REQUIREMENTS.md block, or None.
Raises ValueError when the block carries an invalid pattern value. Valid
values are VALID_PATTERN_VALUES. Absent field returns None.
"""
m = _REQ_PATTERN_RE.search(req_block)
if not m:
return None
value = m.group(1).strip()
if value not in VALID_PATTERN_VALUES:
raise ValueError(
"Invalid REQ pattern '{}'. Expected one of: {}".format(
value, sorted(VALID_PATTERN_VALUES)
)
)
return value
# v1.5.2 — cardinality gate (Lever 3)
VALID_REASON_CLASSES = frozenset({
"out-of-scope",
"deprecated",
"platform-gated",
"handled-upstream",
"intentionally-partial",
})
_CELL_ID_RE = re.compile(r"^REQ-\d+/cell-[A-Za-z0-9_]+-[A-Za-z0-9_]+$")
_COVERS_RE = re.compile(
r"^\s*-\s*Covers:\s*\[(.*?)\]\s*$", re.IGNORECASE | re.MULTILINE
)
_CONSOLIDATION_RE = re.compile(
r"^\s*-\s*Consolidation rationale:\s*(.+?)\s*$",
re.IGNORECASE | re.MULTILINE,
)
_BUG_HEADING_RE = re.compile(r"^###\s+BUG-(\d+):", re.MULTILINE)
# v1.5.2 (C13.8/Fix 1) — evidence locator for present:true grid cells.
# Relative path (no leading '/'), single colon, line number (>=1) or
# range ``N-M`` with both endpoints >=1. Rejects: absolute paths,
# multi-slash roots, URLs, line zero, zero-endpoint ranges.
_EVIDENCE_RE = re.compile(r"^(?!/)[^:]+:[1-9]\d*(-[1-9]\d*)?$")
def _parse_covers(bug_block):
m = _COVERS_RE.search(bug_block)
if not m:
return []
raw = m.group(1).strip()
if not raw:
return []
items = [s.strip() for s in raw.split(",")]
return [s for s in items if s]
def _parse_consolidation_rationale(bug_block):
m = _CONSOLIDATION_RE.search(bug_block)
if not m:
return None
text = m.group(1).strip()
return text or None
def _split_bug_blocks(bugs_md_text):
"""Return list of (bug_id, body) pairs."""
positions = [(m.start(), m.group(1)) for m in _BUG_HEADING_RE.finditer(bugs_md_text)]
result = []
for idx, (start, bug_id) in enumerate(positions):
end = positions[idx + 1][0] if idx + 1 < len(positions) else len(bugs_md_text)
result.append(("BUG-{}".format(bug_id), bugs_md_text[start:end]))
return result
def _bug_primary_requirement(block):
m = re.search(
r"^\s*-\s*Primary requirement:\s*(REQ-\d+)", block, re.MULTILINE | re.IGNORECASE
)
return m.group(1) if m else None
def _load_json_or_none(path):
if not path.is_file():
return None
try:
return json.loads(path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
return None
def _read_text_safe(path):
try:
return path.read_text(encoding="utf-8", errors="replace")
except OSError:
return ""
_REQ_HEADING_RE = re.compile(r"^###\s+(REQ-\d+):", re.MULTILINE)
def _enumerate_pattern_tagged_reqs(req_text):
"""Return {req_id: pattern} for every ### REQ-NNN: block in REQUIREMENTS.md
that carries a ``- Pattern: <value>`` line.
Raises ValueError if any block's pattern value is not in
VALID_PATTERN_VALUES (delegated to extract_req_pattern()). Blocks without a
Pattern field are omitted from the result (they're not pattern-tagged).
"""
if not req_text:
return {}
positions = [(m.start(), m.group(1)) for m in _REQ_HEADING_RE.finditer(req_text)]
result = {}
for idx, (start, req_id) in enumerate(positions):
end = positions[idx + 1][0] if idx + 1 < len(positions) else len(req_text)
block = req_text[start:end]
pattern = extract_req_pattern(block)
if pattern is not None:
result[req_id] = pattern
return result
# v1.5.2 (C13.7/Fix 2) — per-site UC detection.
# Phase 1's Cartesian UC rule emits UC-N.a / UC-N.b / ... for REQs where both
# eligibility gates match. Any REQ block in REQUIREMENTS.md that cites such
# per-site UCs MUST carry a Pattern field — otherwise Phase 2 silently dropped
# it. The regex is deliberately narrow: one lowercase letter suffix only, word
# boundaries on both sides, so bare UC-N and over-suffixed UC-N.a.bad are not
# mistaken for per-site references.
_PER_SITE_UC_RE = re.compile(r"\bUC-\d+\.[a-z]\b")
def _enumerate_per_site_uc_reqs(req_text):
"""Return {req_id: sorted_list_of_uc_ids} for every ### REQ-NNN: block
that cites at least one per-site UC reference (UC-N.a / UC-N.b / ...).
REQ blocks without per-site UC references are omitted from the result.
Each returned UC list is deduplicated and lexically sorted.
"""
if not req_text:
return {}
positions = [(m.start(), m.group(1)) for m in _REQ_HEADING_RE.finditer(req_text)]
result = {}
for idx, (start, req_id) in enumerate(positions):
end = positions[idx + 1][0] if idx + 1 < len(positions) else len(req_text)
block = req_text[start:end]
ucs = sorted(set(_PER_SITE_UC_RE.findall(block)))
if ucs:
result[req_id] = ucs
return result
def validate_cardinality_gate(repo_dir):
"""Run the v1.5.2 cardinality reconciliation gate.
Returns a list of failure strings. An empty list means the gate passed.
Caller decides how to surface failures (print / fail()).
Inputs expected in repo_dir/quality/:
- REQUIREMENTS.md (source of pattern-tagged REQs)
- BUGS.md (source of Covers: annotations)
- compensation_grid.json (source of cell set per REQ)
- compensation_grid_downgrades.json (optional; source of downgrade cells)
"""
failures = []
q = Path(repo_dir) / "quality"
req_text = _read_text_safe(q / "REQUIREMENTS.md")
# Enumerate pattern-tagged and per-site-UC REQs up front so the
# downstream cross-checks can run regardless of whether a grid file
# exists. A REQ that cites per-site UCs but lacks Pattern is a failure
# independent of grid presence (in fact, if Pattern is missing there is
# no grid precisely because Pattern is the trigger for producing one).
try:
pattern_tagged = _enumerate_pattern_tagged_reqs(req_text)
except ValueError as exc:
failures.append("REQUIREMENTS.md: {}".format(exc))
pattern_tagged = {}
try:
per_site = _enumerate_per_site_uc_reqs(req_text)
except ValueError as exc:
failures.append("REQUIREMENTS.md: {}".format(exc))
per_site = {}
# Cross-check (C13.7/Fix 2): every REQ that cites per-site UCs (UC-N.a,
# UC-N.b, ...) in REQUIREMENTS.md MUST carry a Pattern field. Per-site UCs
# are the structural signal emitted by Phase 1's Cartesian UC rule; if the
# signal is there but Pattern is missing, Phase 2 silently dropped it and
# the v1.4.5 regression vector is live again. Runs regardless of grid
# presence because missing Pattern is exactly what would cause the grid
# to be absent in the first place.
for req_id, uc_ids in per_site.items():
if req_id not in pattern_tagged:
failures.append(
"cardinality gate: {} has per-site UCs ({}) in REQUIREMENTS.md "
"but is missing the Pattern field — Phase 1 Cartesian UC rule "
"requires Pattern tagging for cross-site REQs (see "
"phase1_prompt confirmation checklist item 6)".format(
req_id, ", ".join(uc_ids)
)
)
grid_path = q / "compensation_grid.json"
grid = _load_json_or_none(grid_path)
if grid is None:
# No grid file: only a problem if any pattern-tagged REQs exist.
if _REQ_PATTERN_RE.search(req_text):
failures.append(
"cardinality gate: pattern-tagged REQs exist but "
"quality/compensation_grid.json is missing"
)
return failures
reqs = grid.get("reqs") or {}
if not isinstance(reqs, dict):
failures.append("compensation_grid.json: 'reqs' is not an object")
return failures
# Cross-check: every pattern-tagged REQ in REQUIREMENTS.md must appear in
# the grid. Omitting a pattern-tagged REQ from the grid was a v1.5.2 escape
# hatch (silently skipped by the per-REQ reconcile loop); close it here.
for req_id, req_pattern in pattern_tagged.items():
if req_id not in reqs:
failures.append(
"cardinality gate: {} is pattern-tagged '{}' in REQUIREMENTS.md "
"but has no entry in compensation_grid.json".format(req_id, req_pattern)
)
# Load BUGS.md and index covers by REQ
bugs_text = _read_text_safe(q / "BUGS.md")
covers_by_req = {}
for bug_id, block in _split_bug_blocks(bugs_text):
covers = _parse_covers(block)
if len(covers) >= 2:
if not _parse_consolidation_rationale(block):
failures.append(
"{}: Covers has {} entries but 'Consolidation rationale:' is missing or empty".format(
bug_id, len(covers)
)
)
for cell_id in covers:
if not _CELL_ID_RE.match(cell_id):
failures.append(
"{}: malformed cell ID '{}' (expected REQ-N/cell-<item>-<site>)".format(
bug_id, cell_id
)
)
continue
req_id = cell_id.split("/", 1)[0]
covers_by_req.setdefault(req_id, set()).add(cell_id)
# Load downgrades and validate each record
downgrades = _load_json_or_none(q / "compensation_grid_downgrades.json") or {"downgrades": []}
downgrade_cells_by_req = {}
for rec in downgrades.get("downgrades", []):
rid = rec.get("cell_id", "")
if not _CELL_ID_RE.match(rid):
failures.append("downgrade record: malformed cell_id '{}'".format(rid))
continue
# A downgrade record only counts toward reconciliation once every
# validation below passes. A malformed record emits diagnostic
# failure strings AND stays out of downgrade_cells_by_req, so the
# per-REQ uncovered-cells calculation still flags the cell.
rec_ok = True
for field in ("authority_ref", "site_citation", "reason_class", "falsifiable_claim"):
value = rec.get(field)
if not value or not isinstance(value, str) or not value.strip():
failures.append(
"downgrade record {}: missing or empty field '{}'".format(rid, field)
)
rec_ok = False
reason = rec.get("reason_class", "")
if reason and reason not in VALID_REASON_CLASSES:
failures.append(
"downgrade record {}: reason_class '{}' not in {}".format(
rid, reason, sorted(VALID_REASON_CLASSES)
)
)
rec_ok = False
if not rec_ok:
continue
req_id = rid.split("/", 1)[0]
downgrade_cells_by_req.setdefault(req_id, set()).add(rid)
# Reconcile per-REQ
for req_id, entry in reqs.items():
pattern = entry.get("pattern")
if pattern not in {"whitelist", "parity", "compensation"}:
failures.append(
"compensation_grid.json: {} has invalid or missing pattern '{}'".format(
req_id, pattern
)
)
continue
cells = entry.get("cells") or []
# v1.5.2 (C13.8/Fix 2): pre-validate each cell's 'present' field is a
# strict bool. Non-bool values (string "true", int 1, None, missing key)
# would otherwise fall between the 'is False' absent-cell branch and
# the 'is not True' present-cell evidence branch, escaping both checks.
# Same silent-bypass family as B1 — diagnose AND skip the cell, do not
# let it count toward coverage accounting.
valid_cells = []
for c in cells:
if not isinstance(c, dict):
continue
present = c.get("present")
if not isinstance(present, bool):
cell_id = c.get("cell_id") or "<no cell_id>"
failures.append(
"{}: cell {} 'present' must be boolean true or false; got {!r}".format(
req_id, cell_id, present
)
)
continue
valid_cells.append(c)
grid_cell_ids = {c.get("cell_id") for c in valid_cells}
grid_cell_ids.discard(None)
# Only absent cells require coverage. Identity check is safe now —
# every element of valid_cells has 'present' as a strict bool.
absent_cells = {
c.get("cell_id") for c in valid_cells
if c.get("present") is False
}
absent_cells.discard(None)
# v1.5.2 (C13.6/B2): present:true cells must carry a non-empty
# 'evidence' field in file:line form. Without this, a reviewer or LLM
# can claim any cell is present, supply nothing, and the gate accepts
# it — the bypass Round 5 Council called the highest remaining risk.
for c in valid_cells:
if c.get("present") is not True:
continue
cell_id = c.get("cell_id") or "<no cell_id>"
evidence = c.get("evidence")
if not evidence or not isinstance(evidence, str) or not evidence.strip():
failures.append(
"{}: present:true requires non-empty 'evidence' field with file:line citation".format(cell_id)
)
continue
if not _EVIDENCE_RE.match(evidence.strip()):
failures.append(
"{}: 'evidence' must be file:line (e.g. 'path/to.c:123' or 'path/to.c:120-140'); got {!r}".format(
cell_id, evidence
)
)
covered = covers_by_req.get(req_id, set())
downgraded = downgrade_cells_by_req.get(req_id, set())
uncovered = absent_cells - covered - downgraded
if uncovered:
failures.append(
"{}: uncovered cells — {}".format(req_id, ", ".join(sorted(uncovered)))
)
# Every covered cell must be in the grid
stray = (covered | downgraded) - grid_cell_ids
if stray:
failures.append(
"{}: Covers/downgrade cells not in grid — {}".format(
req_id, ", ".join(sorted(stray))
)
)
return failures
def _reset_counters():
global FAIL, WARN
FAIL = 0
WARN = 0
def fail(msg, reason=None, *, line=None):
"""Emit a structured failure line and increment FAIL.
Phase 5 r3 format: `<path>[:<line>]: <reason>` — no "FAIL:" label, so
output is grep-parseable as `^[^:]+:[0-9]*:? .+$`. The prefix `FAIL:` is
deliberately removed; the global FAIL counter (summarised in main()) is
the authoritative count of failures per run.
Preferred forms:
fail("quality/INDEX.md", "file missing")
-> " quality/INDEX.md: file missing"
fail("quality/INDEX.md", "missing required field 'x'", line=42)
-> " quality/INDEX.md:42: missing required field 'x'"
Legacy single-arg form (transitional; still supported — most v1.4.x
messages already embed a path-like token):
fail("BUGS.md missing or not a file")
-> " BUGS.md missing or not a file"
"""
global FAIL
if reason is None:
print(f" {msg}")
elif line is None:
print(f" {msg}: {reason}")
else:
print(f" {msg}:{line}: {reason}")
FAIL += 1
def pass_(msg):
print(f" PASS: {msg}")
def warn(msg):
global WARN
print(f" WARN: {msg}")
WARN += 1
def info(msg):
print(f" INFO: {msg}")
# --- JSON helpers (proper parsing, not grep-style) ---
def load_json(path):
"""Parse JSON file. Return parsed value, or None on any error."""
if not path.is_file():
return None
try:
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
except (OSError, json.JSONDecodeError):
return None
def has_key(data, key):
"""True if `data` is a dict containing `key`."""
return isinstance(data, dict) and key in data
def get_str(data, key):
"""Return data[key] if it's a string, else empty string."""
if not isinstance(data, dict):
return ""
val = data.get(key)
return val if isinstance(val, str) else ""
def count_per_bug_field(bugs_list, field):
"""Count bugs in list that have `field` set."""
if not isinstance(bugs_list, list):
return 0
return sum(1 for b in bugs_list if isinstance(b, dict) and field in b)
# --- File helpers ---
# v1.5.4 Phase 3.6.4 (B-16): the end-of-run reorg moves intermediate
# pipeline artifacts under quality/workspace/. The gate reads each of
# those subdirectories at multiple sites; _resolve_artifact_path
# centralises the dual-layout lookup so each site stays one-line.
# Top-level wins (legacy / pre-reorg layout); workspace/ is the v1.5.4
# canonical location after _finalize_quality_layout has run.
_WORKSPACE_DIRS = (
"control_prompts",
"results",
"code_reviews",
"spec_audits",
"patches",
"writeups",
"mechanical",
"phase3",
)
def _resolve_artifact_path(quality_dir, name):
"""Return the live path for an intermediate artifact directory or
file under quality/. Tries top-level first (the legacy / current
in-flight layout), then quality/workspace/<name> (the v1.5.4
end-of-run reorg layout). Returns the top-level path even when
neither exists so callers that test ``.is_dir()`` / ``.is_file()``
get a False rather than an exception.
``name`` may be a single segment (``"results"``) or a path with
segments (``"results/tdd-results.json"``); both forms work
regardless of layout."""
top = quality_dir / name
if top.exists():
return top
workspace = quality_dir / "workspace" / name
if workspace.exists():
return workspace
return top
def has_file_matching(directory, patterns):
"""True if any file in `directory` (non-recursive) matches any glob pattern."""
if not directory.is_dir():
return False
for pat in patterns:
for _ in directory.glob(pat):
return True
return False
def count_files_matching(directory, pattern):
"""Count files in `directory` (non-recursive) matching glob pattern."""
if not directory.is_dir():
return 0
return sum(1 for _ in directory.glob(pattern))
def first_file_matching(directory, patterns):
"""Return first matching path or None."""
if not directory.is_dir():
return None
for pat in patterns:
for p in directory.glob(pat):
return p
return None
def file_contains(path, pattern):
"""True if any line in file matches pattern (regex string or compiled)."""
if not path.is_file():
return False
if isinstance(pattern, str):
pattern = re.compile(pattern)
try:
with open(path, "r", encoding="utf-8", errors="replace") as f:
for line in f:
if pattern.search(line):
return True
except OSError:
pass
return False
def read_first_line_stripped(path):
"""Return first line of file with whitespace stripped."""
if not path.is_file():
return ""
try:
with open(path, "r", encoding="utf-8", errors="replace") as f:
line = f.readline()
except OSError:
return ""
return re.sub(r"\s", "", line)
def validate_iso_date(date_str):
"""Return one of: 'valid', 'placeholder', 'future', 'bad_format', 'empty'.
Placeholders are checked before format so that 'YYYY-MM-DD' is reported
as 'placeholder' rather than 'bad_format'. The bash version's order was
flipped, causing 'YYYY-MM-DD' to be misreported — both still FAIL but the
Python version gives the clearer message.
"""
if not date_str:
return "empty"
if date_str in ("YYYY-MM-DD", "0000-00-00"):
return "placeholder"
date_part = date_str[:10]
if not re.fullmatch(r"\d{4}-\d{2}-\d{2}", date_part):
return "bad_format"
if len(date_str) > 10 and not re.fullmatch(r"T\d{2}:\d{2}:\d{2}(Z|[+-]\d{2}:\d{2})?", date_str[10:]):
return "bad_format"
today = date.today().isoformat()
if date_part > today:
return "future"
return "valid"
def detect_skill_version(locations):
"""Read `version:` value from the first existing SKILL.md-like file."""
for loc in locations:
if loc.is_file():
try:
with open(loc, "r", encoding="utf-8", errors="replace") as f:
for line in f:
m = re.match(r"^\s*(?:version:|\*\*Version:\*\*)\s*([0-9]+(?:\.[0-9]+)+)\b",
line, re.IGNORECASE)
if m:
return m.group(1)
except OSError:
continue
return ""
def read_skill_value_line(path, prefix):
"""Mimic: grep -m1 'prefix' FILE | sed 's/.*prefix *//' | tr -d ' '."""
if not path.is_file():
return ""
try:
with open(path, "r", encoding="utf-8", errors="replace") as f:
for line in f:
if prefix in line:
v = re.sub(rf".*{re.escape(prefix)}\s*", "", line, count=1)
return v.replace(" ", "").rstrip("\n").rstrip("\r")
except OSError:
pass
return ""
def detect_project_language(repo_dir):
"""Walk up to 3 dirs deep, return first language whose extension is present.
Mirrors bash `find -maxdepth 3 -not -path ...` behavior.
"""
language_order = [
("go", ".go"),
("py", ".py"),
("java", ".java"),
("kt", ".kt"),
("rs", ".rs"),
("ts", ".ts"),
("js", ".js"),
("scala", ".scala"),
("c", ".c"),
("agc", ".agc"),
]
excluded = {"vendor", "node_modules", ".git", "quality", "repos"}
def present(base, target_ext):
stack = [(Path(base), 1)]
while stack:
curr, depth = stack.pop()
try:
for entry in os.scandir(curr):
name = entry.name
if entry.is_dir(follow_symlinks=False):
if name in excluded:
continue
if depth < 3:
stack.append((Path(entry.path), depth + 1))
elif entry.is_file(follow_symlinks=False):
if name.endswith(target_ext):
return True
except (OSError, PermissionError):
continue
return False
for lang, ext in language_order:
if present(repo_dir, ext):
return lang
return ""
def count_source_files(repo_dir):
"""Count source files up to 4 dirs deep, excluding vendor/node_modules/etc."""
src_count = 0
exts = {".go", ".py", ".java", ".kt", ".rs", ".ts", ".js", ".scala",
".c", ".h", ".agc"}
excluded = {"vendor", "node_modules", ".git", "quality"}
def walk(base, current_depth, max_depth):
nonlocal src_count
try:
for entry in os.scandir(base):
name = entry.name
if entry.is_dir(follow_symlinks=False):
if current_depth < max_depth and name not in excluded:
walk(entry.path, current_depth + 1, max_depth)
elif entry.is_file(follow_symlinks=False):
dot = name.rfind(".")
if dot >= 0 and name[dot:] in exts:
src_count += 1
except (OSError, PermissionError):
pass
walk(str(repo_dir), 1, 4)
return src_count
# --- Section checks ---
def check_file_existence(repo_dir, q, strictness):
"""File existence section (benchmark 40)."""
print("[File Existence]")
for f in ["BUGS.md", "REQUIREMENTS.md", "QUALITY.md", "PROGRESS.md",
"COVERAGE_MATRIX.md", "COMPLETENESS_REPORT.md"]:
if (q / f).is_file():
pass_(f"{f} exists")
else:
fail(f"{f} missing")
for f in ["CONTRACTS.md", "RUN_CODE_REVIEW.md", "RUN_SPEC_AUDIT.md",
"RUN_INTEGRATION_TESTS.md", "RUN_TDD_TESTS.md"]:
if (q / f).is_file():
pass_(f"{f} exists")
else:
fail(f"{f} missing")
if has_file_matching(q, ["test_functional.*", "functional_test.*",
"FunctionalSpec.*", "FunctionalTest.*",
"functional.test.*"]):
pass_("functional test file exists")
else:
fail("functional test file missing (test_functional.*, functional_test.*, FunctionalSpec.*, FunctionalTest.*, functional.test.*)")
if (repo_dir / "AGENTS.md").is_file():
pass_("AGENTS.md exists")
else:
fail("AGENTS.md missing (required at project root)")
if (q / "EXPLORATION.md").is_file():
pass_("EXPLORATION.md exists")
_check_exploration_sections(q / "EXPLORATION.md")
else:
fail("EXPLORATION.md missing")
cr_dir = _resolve_artifact_path(q, "code_reviews")
if cr_dir.is_dir() and has_file_matching(cr_dir, ["*.md"]):
pass_("code_reviews/ has .md files")
else:
fail("code_reviews/ missing or empty")
sa_dir = _resolve_artifact_path(q, "spec_audits")
if sa_dir.is_dir():
triage_count = count_files_matching(sa_dir, "*triage*")
auditor_count = count_files_matching(sa_dir, "*auditor*")
if triage_count > 0:
pass_("spec_audits/ has triage file")
else:
fail("spec_audits/ missing triage file")
if auditor_count > 0:
pass_(f"spec_audits/ has {auditor_count} auditor file(s)")
else:
fail("spec_audits/ missing individual auditor files")
if triage_count > 0:
has_probes = False
if (sa_dir / "triage_probes.sh").is_file():
has_probes = True
pass_("triage_probes.sh exists (executable triage evidence)")
elif (_resolve_artifact_path(q, "mechanical/verify.sh")).is_file() and \
file_contains(_resolve_artifact_path(q, "mechanical/verify.sh"), r"probe|triage|auditor"):
has_probes = True
pass_("verify.sh contains triage probe assertions")
if not has_probes:
msg = "No executable triage evidence found (expected spec_audits/triage_probes.sh or probe assertions in mechanical/verify.sh)"
if strictness == "benchmark":
fail(msg)
else:
warn(msg)
else:
fail("spec_audits/ directory missing")
def check_bugs_heading(q):
"""BUGS.md heading-format section (benchmark 39).
Returns (bug_count, bug_ids).
"""
print("[BUGS.md Heading Format]")
bugs_md = q / "BUGS.md"
if not bugs_md.is_file():
fail("BUGS.md missing")
return 0, []
try:
bugs_content = bugs_md.read_text(encoding="utf-8", errors="replace")
except OSError:
bugs_content = ""
lines = bugs_content.splitlines()
correct_headings = sum(1 for ln in lines
if re.match(r"^### BUG-([HML]|[0-9])[0-9]*", ln))
wrong_headings = sum(1 for ln in lines
if re.match(r"^## BUG-", ln)
and not re.match(r"^### BUG-", ln))
deep_headings = sum(1 for ln in lines
if re.match(r"^#{4,} BUG-([HML]|[0-9])", ln))
bold_headings = sum(1 for ln in lines
if re.match(r"^\*\*BUG-([HML]|[0-9])", ln))
bullet_headings = sum(1 for ln in lines
if re.match(r"^- BUG-([HML]|[0-9])", ln))
bug_count = correct_headings
if (correct_headings > 0 and wrong_headings == 0 and deep_headings == 0
and bold_headings == 0 and bullet_headings == 0):
pass_(f"All {correct_headings} bug headings use ### BUG-NNN format")
else:
if wrong_headings > 0:
fail(f"{wrong_headings} heading(s) use ## instead of ###")
if deep_headings > 0:
fail(f"{deep_headings} heading(s) use #### or deeper instead of ###")
if bold_headings > 0:
fail(f"{bold_headings} heading(s) use **BUG- format")
if bullet_headings > 0:
fail(f"{bullet_headings} heading(s) use - BUG- format")
if correct_headings == 0 and wrong_headings == 0:
if re.search(r"^##\s+(No confirmed bugs|Zero confirmed bugs)\s*$",
bugs_content, re.MULTILINE | re.IGNORECASE):
pass_("Zero-bug run — no headings expected")
else:
bug_count = wrong_headings + deep_headings + bold_headings + bullet_headings
warn("No ### BUG-NNN headings found in BUGS.md")
else:
bug_count = correct_headings + wrong_headings + bold_headings + bullet_headings
# Extract canonical bug IDs: BUG-NNN or BUG-HNN / BUG-MNN / BUG-LNN
raw = re.findall(r"BUG-(?:[HML][0-9]+|[0-9]+)", bugs_content)
filtered = [b for b in raw if re.fullmatch(r"BUG-(?:[HML][0-9]+|[0-9]+)", b)]
bug_ids = sorted(set(filtered))
return bug_count, bug_ids
def check_tdd_sidecar(q, bug_count):
"""TDD sidecar JSON (benchmarks 14, 41)."""
print("[TDD Sidecar JSON]")
json_path = _resolve_artifact_path(q, "results/tdd-results.json")
if bug_count <= 0:
info("Zero bugs — tdd-results.json not required")
return None
if not json_path.is_file():
fail(f"tdd-results.json missing ({bug_count} bugs require it)")
return None
pass_(f"tdd-results.json exists ({bug_count} bugs)")
data = load_json(json_path)
if data is None:
# File exists but unparsable — fail all root key checks
for key in ["schema_version", "skill_version", "date", "project",
"bugs", "summary"]:
fail(f"missing root key '{key}'")
fail("schema_version is 'missing', expected '1.1'")
return None
for key in ["schema_version", "skill_version", "date", "project",
"bugs", "summary"]:
if has_key(data, key):
pass_(f"has '{key}'")
else:
fail(f"missing root key '{key}'")
sv = get_str(data, "schema_version")
if sv == "1.1":
pass_("schema_version is '1.1'")
else:
fail(f"schema_version is '{sv or 'missing'}', expected '1.1'")
bugs_list = data.get("bugs") if isinstance(data, dict) else None
if not isinstance(bugs_list, list):
bugs_list = []
for field in ["id", "requirement", "red_phase", "green_phase",
"verdict", "fix_patch_present", "writeup_path"]:
fcount = count_per_bug_field(bugs_list, field)
if fcount >= bug_count:
pass_(f"per-bug field '{field}' present ({fcount}x)")
elif fcount > 0:
warn(f"per-bug field '{field}' found {fcount}x, expected {bug_count}")
else:
fail(f"per-bug field '{field}' missing entirely")
# Non-canonical field names (at any level — check root and bugs)
bad_fields = ["bug_id", "bug_name", "status", "phase", "result"]
for bad in bad_fields:
found = has_key(data, bad) or any(
has_key(b, bad) for b in bugs_list if isinstance(b, dict)
)
if found:
fail(f"non-canonical field '{bad}' found (use standard field names)")
summary = data.get("summary") if isinstance(data, dict) else None
if not isinstance(summary, dict):
summary = {}
for skey in ["total", "verified", "confirmed_open", "red_failed", "green_failed"]:
if skey in summary:
pass_(f"summary has '{skey}'")
else:
fail(f"summary missing '{skey}' count")
# Date validation
tdd_date = get_str(data, "date")
status = validate_iso_date(tdd_date)
if status == "empty":
fail("tdd-results.json date field missing or empty")
elif status == "bad_format":
fail(f"tdd-results.json date '{tdd_date}' is not ISO 8601 (YYYY-MM-DD)")
elif status == "placeholder":
fail(f"tdd-results.json date is placeholder '{tdd_date}'")
elif status == "future":
fail(f"tdd-results.json date '{tdd_date}' is in the future")
else:
pass_(f"tdd-results.json date '{tdd_date}' is valid")
# Verdict enum
allowed_verdicts = {"TDD verified", "red failed", "green failed",
"confirmed open", "deferred"}
bad_verdicts = 0
for b in bugs_list:
if isinstance(b, dict) and "verdict" in b:
v = b.get("verdict")
if v not in allowed_verdicts:
bad_verdicts += 1
if bad_verdicts == 0:
pass_("all verdict values are canonical")
else:
fail(f"{bad_verdicts} non-canonical verdict value(s)")
return data
def check_tdd_logs(q, bug_count, bug_ids, tdd_data):
"""TDD log files and sidecar-to-log cross-validation."""
print("[TDD Log Files]")
if bug_count <= 0:
info("Zero bugs — TDD log files not required")
return
patches_dir = _resolve_artifact_path(q, "patches")
results_dir = _resolve_artifact_path(q, "results")
valid_tags = {"RED", "GREEN", "NOT_RUN", "ERROR"}
red_found = 0
red_missing = 0
green_found = 0
green_missing = 0
green_expected = 0
red_bad_tag = 0
green_bad_tag = 0
for bid in bug_ids:
red_log = results_dir / f"{bid}.red.log"
if red_log.is_file():
red_found += 1
tag = read_first_line_stripped(red_log)
if tag not in valid_tags:
red_bad_tag += 1
else:
red_missing += 1
fix_patch = first_file_matching(patches_dir, [f"{bid}-fix*.patch"])
if fix_patch is not None:
green_expected += 1
green_log = results_dir / f"{bid}.green.log"
if green_log.is_file():
green_found += 1
tag = read_first_line_stripped(green_log)
if tag not in valid_tags:
green_bad_tag += 1
else:
green_missing += 1
if red_missing == 0 and red_found > 0:
pass_(f"All {red_found} confirmed bug(s) have red-phase logs")
elif red_found > 0:
fail(f"{red_missing} confirmed bug(s) missing red-phase log (BUG-NNN.red.log)")
else:
fail("No red-phase logs found (every confirmed bug needs quality/results/BUG-NNN.red.log)")
if green_expected > 0:
if green_missing == 0:
pass_(f"All {green_found} bug(s) with fix patches have green-phase logs")
else:
fail(f"{green_missing} bug(s) with fix patches missing green-phase log (BUG-NNN.green.log)")
else:
info("No fix patches found — green-phase logs not required")
if red_bad_tag > 0:
fail(f"{red_bad_tag} red-phase log(s) missing valid first-line status tag (expected RED/GREEN/NOT_RUN/ERROR)")
elif red_found > 0:
pass_("All red-phase logs have valid status tags")
if green_bad_tag > 0:
fail(f"{green_bad_tag} green-phase log(s) missing valid first-line status tag (expected RED/GREEN/NOT_RUN/ERROR)")
elif green_found > 0:
pass_("All green-phase logs have valid status tags")
# Sidecar-to-log cross-validation (BUG-M18)
if tdd_data is not None and isinstance(tdd_data, dict):
bugs_list = tdd_data.get("bugs") or []
if not isinstance(bugs_list, list):
bugs_list = []
# Index bugs by id for lookup
bug_by_id = {}
for b in bugs_list:
if isinstance(b, dict) and isinstance(b.get("id"), str):
bug_by_id[b["id"]] = b
xv_checked = 0
xv_mismatch = 0
for bid in bug_ids:
bug_obj = bug_by_id.get(bid)
sidecar_red = get_str(bug_obj, "red_phase") if bug_obj else ""
sidecar_green = get_str(bug_obj, "green_phase") if bug_obj else ""
red_log = results_dir / f"{bid}.red.log"
if sidecar_red and red_log.is_file():
log_tag = read_first_line_stripped(red_log)
xv_checked += 1
if sidecar_red == "fail" and log_tag != "RED":
xv_mismatch += 1
fail(f"{bid}: sidecar red_phase='{sidecar_red}' but log first-line is '{log_tag}' (expected RED)")
elif sidecar_red == "pass" and log_tag != "GREEN":
xv_mismatch += 1
fail(f"{bid}: sidecar red_phase='{sidecar_red}' but log first-line is '{log_tag}' (expected GREEN)")
green_log = results_dir / f"{bid}.green.log"
if sidecar_green and green_log.is_file():
log_tag = read_first_line_stripped(green_log)
xv_checked += 1
if sidecar_green == "pass" and log_tag != "GREEN":
xv_mismatch += 1
fail(f"{bid}: sidecar green_phase='{sidecar_green}' but log first-line is '{log_tag}' (expected GREEN)")
elif sidecar_green == "fail" and log_tag != "RED":
xv_mismatch += 1
fail(f"{bid}: sidecar green_phase='{sidecar_green}' but log first-line is '{log_tag}' (expected RED)")
if xv_checked > 0 and xv_mismatch == 0:
pass_(f"Sidecar-to-log cross-validation passed ({xv_checked} checks)")
elif xv_checked == 0:
info("Sidecar-to-log cross-validation: no matching pairs to check")
# TDD_TRACEABILITY.md
if red_found > 0:
if (q / "TDD_TRACEABILITY.md").is_file():
pass_(f"TDD_TRACEABILITY.md exists ({red_found} bugs with red-phase results)")
else:
fail("TDD_TRACEABILITY.md missing (mandatory when bugs have red-phase results)")
def check_integration_sidecar(q, strictness):
"""Integration sidecar JSON section."""
print("[Integration Sidecar JSON]")
ij = _resolve_artifact_path(q, "results/integration-results.json")
if not ij.is_file():
if strictness == "benchmark":
warn("integration-results.json not present")
else:
info("integration-results.json not present (optional in general mode)")
return
data = load_json(ij)
for key in ["schema_version", "skill_version", "date", "project",
"recommendation", "groups", "summary", "uc_coverage"]:
if has_key(data, key):
pass_(f"has '{key}'")
else:
fail(f"missing key '{key}'")
summary = data.get("summary") if isinstance(data, dict) else None
if not isinstance(summary, dict):
summary = {}
for iskey in ["total_groups", "passed", "failed", "skipped"]:
if iskey in summary:
pass_(f"integration summary has '{iskey}'")
else:
fail(f"integration summary missing required sub-key '{iskey}'")
isv = get_str(data, "schema_version")
if isv == "1.1":
pass_("integration schema_version is '1.1'")
else:
fail(f"integration schema_version is '{isv or 'missing'}', expected '1.1'")
int_date = get_str(data, "date")
if int_date: # match bash: if [ -n "$int_date" ]
status = validate_iso_date(int_date)
if status == "bad_format":
fail(f"integration-results.json date '{int_date}' is not ISO 8601 (YYYY-MM-DD)")
elif status == "placeholder":
fail(f"integration-results.json date is placeholder '{int_date}'")
elif status == "future":
fail(f"integration-results.json date '{int_date}' is in the future")
else:
pass_(f"integration-results.json date '{int_date}' is valid")
rec = get_str(data, "recommendation")
if rec in ("SHIP", "FIX BEFORE MERGE", "BLOCK"):
pass_(f"recommendation '{rec}' is canonical")
elif rec:
fail(f"recommendation '{rec}' is non-canonical (must be SHIP/FIX BEFORE MERGE/BLOCK)")
else:
fail("recommendation missing")
# groups[].result enum
allowed_results = {"pass", "fail", "skipped", "error"}
bad_results = 0
groups = data.get("groups") if isinstance(data, dict) else None
if isinstance(groups, list):
for g in groups:
if isinstance(g, dict) and "result" in g:
if g.get("result") not in allowed_results:
bad_results += 1
if bad_results == 0:
pass_("all groups[].result values are canonical")
else:
fail(f"{bad_results} non-canonical groups[].result value(s) (must be pass/fail/skipped/error)")
# uc_coverage value enum
allowed_uc = {"covered_pass", "covered_fail", "not_mapped"}
bad_uc = 0
uc_cov = data.get("uc_coverage") if isinstance(data, dict) else None
if isinstance(uc_cov, dict):
for v in uc_cov.values():
if v not in allowed_uc:
bad_uc += 1
if bad_uc == 0:
pass_("all uc_coverage values are canonical")
else:
fail(f"{bad_uc} non-canonical uc_coverage value(s) (must be covered_pass/covered_fail/not_mapped)")
def check_recheck_sidecar(q):
"""Recheck sidecar JSON (schema 1.0, uses 'results' key not 'bugs')."""
print("[Recheck Sidecar JSON]")
rj = _resolve_artifact_path(q, "results/recheck-results.json")
rs = _resolve_artifact_path(q, "results/recheck-summary.md")
if not rj.is_file():
info("recheck-results.json not present (only required when recheck mode was run)")
return
pass_("recheck-results.json exists")
data = load_json(rj)
# SKILL.md recheck template uses 'results' as the array key, not 'bugs'.
for key in ["schema_version", "skill_version", "date", "project",
"results", "summary"]:
if has_key(data, key):
pass_(f"recheck has '{key}'")
else:
fail(f"recheck missing root key '{key}'")
rsv = get_str(data, "schema_version")
if rsv == "1.0":
pass_("recheck schema_version is '1.0'")
else:
fail(f"recheck schema_version is '{rsv or 'missing'}', expected '1.0'")
rdate = get_str(data, "date")
if rdate:
status = validate_iso_date(rdate)
if status == "bad_format":
fail(f"recheck-results.json date '{rdate}' is not ISO 8601 (YYYY-MM-DD)")
elif status == "placeholder":
fail(f"recheck-results.json date is placeholder '{rdate}'")
elif status == "future":
fail(f"recheck-results.json date '{rdate}' is in the future")
else:
pass_(f"recheck-results.json date '{rdate}' is valid")
if rs.is_file():
pass_("recheck-summary.md exists")
else:
fail("recheck-summary.md missing (required companion to recheck-results.json)")
def check_use_cases(repo_dir, q, strictness):
"""Use case identifier section (benchmarks 43, 48)."""
print("[Use Cases]")
req_md = q / "REQUIREMENTS.md"
if not req_md.is_file():
fail("REQUIREMENTS.md missing")
return
try:
req_content = req_md.read_text(encoding="utf-8", errors="replace")
except OSError:
req_content = ""
# uc_ids: count of lines matching UC-N (bash grep -cE counts lines)
uc_ids = sum(1 for ln in req_content.splitlines()
if re.search(r"UC-[0-9]+", ln))
uc_unique = len(set(re.findall(r"UC-[0-9]+", req_content)))
src_count = count_source_files(repo_dir) if repo_dir.is_dir() else 0
min_uc = 3 if src_count < 5 else 5
if uc_unique >= min_uc:
pass_(f"Found {uc_unique} distinct UC identifiers ({uc_ids} total references, {src_count} source files)")
elif uc_unique > 0:
connector = "for" if strictness == "general" else "required for"
msg = f"Only {uc_unique} distinct UC identifiers (minimum {min_uc} {connector} {src_count} source files)"
if strictness == "general":
warn(msg)
else:
fail(msg)
else:
fail("No canonical UC-NN identifiers in REQUIREMENTS.md")
def check_test_file_extension(repo_dir, q):
"""Test file extension matches project language (benchmark 47)."""
print("[Test File Extension]")
func_test = first_file_matching(q, ["test_functional.*", "functional_test.*",
"FunctionalSpec.*", "FunctionalTest.*",
"functional.test.*"])
reg_test = first_file_matching(q, ["test_regression.*"])
if func_test is None:
warn("No functional test file found across the supported naming matrix")
return
ext = func_test.suffix.lstrip(".") if func_test.suffix else ""
detected_lang = detect_project_language(repo_dir) if repo_dir.is_dir() else ""
if not detected_lang:
info(f"Cannot detect project language — skipping extension check (test_functional.{ext})")
return
lang_to_valid = {
"go": "go",
"py": "py",
"java": "java",
"kt": "kt java",
"rs": "rs",
"ts": "ts",
"js": "js ts",
"scala": "scala",
"c": "c py sh",
"agc": "py sh",
}
valid_ext = lang_to_valid.get(detected_lang, "")
valid_list = valid_ext.split()
primary = valid_list[0] if valid_list else ""
if ext in valid_list:
pass_(f"{func_test.name} matches project language ({detected_lang})")
else:
fail(f"{func_test.name} does not match project language ({detected_lang}) — expected .{primary}")
if reg_test is not None:
reg_ext = reg_test.suffix.lstrip(".") if reg_test.suffix else ""
if reg_ext in valid_list:
pass_(f"test_regression.{reg_ext} matches project language ({detected_lang})")
else:
fail(f"test_regression.{reg_ext} does not match project language ({detected_lang}) — expected .{primary}")
def check_terminal_gate(q):
"""Terminal Gate section in PROGRESS.md."""
print("[Terminal Gate]")
progress_md = q / "PROGRESS.md"
if not progress_md.is_file():
return
pat = re.compile(r"^#+ *Terminal", re.IGNORECASE | re.MULTILINE)
if file_contains(progress_md, pat):
pass_("PROGRESS.md has Terminal Gate section")
else:
fail("PROGRESS.md missing Terminal Gate section")
def check_mechanical(q):
"""Mechanical verification section."""
print("[Mechanical Verification]")
mech_dir = _resolve_artifact_path(q, "mechanical")
if not mech_dir.is_dir():
info("No mechanical/ directory")
return
verify_sh = mech_dir / "verify.sh"
if not verify_sh.is_file():
fail("mechanical/ exists but verify.sh missing")
return
pass_("verify.sh exists")
mv_log = _resolve_artifact_path(q, "results/mechanical-verify.log")
mv_exit = _resolve_artifact_path(q, "results/mechanical-verify.exit")
if mv_log.is_file() and mv_exit.is_file():
try:
exit_code = mv_exit.read_text(encoding="utf-8", errors="replace")
except OSError:
exit_code = ""
exit_code = re.sub(r"\s", "", exit_code)
if exit_code == "0":
pass_("mechanical-verify.exit is 0")
else:
fail(f"mechanical-verify.exit is '{exit_code}', expected 0")
else:
fail("Verification receipt files missing")
def check_patches(q, bug_count, bug_ids, strictness):
"""Patches section (benchmark 44)."""
print("[Patches]")
if bug_count <= 0:
return
patches_dir = _resolve_artifact_path(q, "patches")
# Regression test file — required when bugs exist
reg_test_file = None
if q.is_dir():
reg_files = sorted(q.glob("test_regression.*"))
if reg_files:
reg_test_file = reg_files[0]
if reg_test_file is not None:
pass_(f"test_regression.* exists ({bug_count} confirmed bugs require it)")
else:
msg = "test_regression.* missing — required when bugs exist (SKILL.md artifact contract)"
if strictness == "benchmark":
fail(msg)
else:
warn(msg)
reg_patch_count = 0
fix_patch_count = 0
reg_patch_missing = 0
for bid in bug_ids:
if first_file_matching(patches_dir, [f"{bid}-regression*.patch"]) is not None:
reg_patch_count += 1
else:
reg_patch_missing += 1
if first_file_matching(patches_dir, [f"{bid}-fix*.patch"]) is not None:
fix_patch_count += 1
if reg_patch_missing == 0 and reg_patch_count > 0:
pass_(f"{reg_patch_count} regression-test patch(es) for {bug_count} bug(s)")
elif reg_patch_count > 0:
fail(f"{reg_patch_missing} bug(s) missing regression-test patch")
else:
fail("No regression-test patches found (quality/patches/BUG-NNN-regression-test.patch required)")
if fix_patch_count > 0:
pass_(f"{fix_patch_count} fix patch(es)")
else:
warn("0 fix patches (fix patches are optional but strongly encouraged)")
total_patches = reg_patch_count + fix_patch_count
info(f"Total: {total_patches} patch file(s) in quality/patches/")
# Unfilled-template sentinel phrases produced by the Phase 5 writeup stub.
# Presence of any of these strings in a writeup is strong evidence that the
# template was emitted without hydrating its content fields from BUGS.md.
# See bin/run_playbook.py::phase5_prompt for the generating prompt.
_WRITEUP_TEMPLATE_SENTINELS = (
"is a confirmed code bug in ``",
"The affected implementation lives at ``",
"Patch path: ``",
"- Regression test: ``",
"- Regression patch: ``",
)
# Matches a ```diff fenced block and captures its body for content inspection.
_WRITEUP_DIFF_BLOCK_RE = re.compile(r"```diff\s*\n(.*?)```", re.DOTALL | re.IGNORECASE)
def _writeup_diff_is_non_empty(text):
"""True if any ```diff block in ``text`` contains at least one unified-diff
line (a `+` or `-` that is not the `+++`/`---` file-header prefix)."""
for block in _WRITEUP_DIFF_BLOCK_RE.findall(text):
for line in block.splitlines():
stripped = line.lstrip()
if stripped.startswith("+++") or stripped.startswith("---"):
continue
if stripped.startswith(("+", "-")):
return True
return False
def check_writeups(q, bug_count):
"""Bug writeups section (benchmark 30)."""
print("[Bug Writeups]")
if bug_count <= 0:
return
writeups_dir = _resolve_artifact_path(q, "writeups")
writeup_count = 0
writeup_diff_count = 0
empty_diff_writeups = []
sentinel_writeups = []
if writeups_dir.is_dir():
writeup_files = sorted(p for p in writeups_dir.glob("BUG-*.md") if p.is_file())
writeup_count = len(writeup_files)
for wf in writeup_files:
try:
text = wf.read_text(encoding="utf-8", errors="replace")
except OSError:
continue
# Presence test uses the same regex as the content test so the
# two can never disagree on whether a fence exists. Case-insensitive
# match accepts ```diff / ```Diff / ```DIFF uniformly — operators
# routinely uppercase the fence tag and the gate must not silently
# skip those writeups (the content non-emptiness check would then
# never fire, producing a confusing "no inline fix diffs" FAIL on a
# writeup that visibly contains a unified diff).
if _WRITEUP_DIFF_BLOCK_RE.search(text):
writeup_diff_count += 1
if not _writeup_diff_is_non_empty(text):
empty_diff_writeups.append(wf.name)
if any(s in text for s in _WRITEUP_TEMPLATE_SENTINELS):
sentinel_writeups.append(wf.name)
if writeup_count >= bug_count:
pass_(f"{writeup_count} writeup(s) for {bug_count} bug(s)")
elif writeup_count > 0:
fail(f"{writeup_count} writeup(s) for {bug_count} bug(s) — all confirmed bugs require writeups (SKILL.md line 1454)")
else:
fail(f"No writeups for {bug_count} confirmed bug(s)")
if writeup_count > 0:
if writeup_diff_count >= writeup_count:
pass_(f"All {writeup_diff_count} writeup(s) have inline fix diffs")
elif writeup_diff_count > 0:
fail(f"Only {writeup_diff_count}/{writeup_count} writeup(s) have inline fix diffs (all require section 6 diff)")
else:
fail("No writeups have inline fix diffs (section 6 'The fix' must include a ```diff block)")
# Non-empty-diff content check. A ```diff fence with no `+`/`-` body
# is a template stub — the legacy presence-only check let these pass.
if empty_diff_writeups:
preview = ", ".join(empty_diff_writeups[:5])
suffix = f" (+{len(empty_diff_writeups) - 5} more)" if len(empty_diff_writeups) > 5 else ""
fail(
f"{len(empty_diff_writeups)} writeup(s) have empty ```diff blocks "
f"(fence present, no +/- lines): {preview}{suffix}"
)
else:
pass_("All writeup ```diff blocks contain unified-diff content")
# Template-sentinel check. Any of these strings remaining in a writeup
# means the Phase 5 stub was emitted without hydrating from BUGS.md.
if sentinel_writeups:
preview = ", ".join(sentinel_writeups[:5])
suffix = f" (+{len(sentinel_writeups) - 5} more)" if len(sentinel_writeups) > 5 else ""
fail(
f"{len(sentinel_writeups)} writeup(s) contain unfilled template "
f"sentinels (empty backticks after 'is a confirmed code bug in', "
f"'The affected implementation lives at', 'Patch path:', "
f"'Regression test:', or 'Regression patch:'): {preview}{suffix}"
)
else:
pass_("No writeups contain unfilled template sentinels")
def check_version_stamps(repo_dir, q):
"""Version stamp consistency (benchmark 26). Returns detected skill_version."""
print("[Version Stamps]")
skill_version = detect_skill_version([
repo_dir / "SKILL.md",
repo_dir / ".claude" / "skills" / "quality-playbook" / "SKILL.md",
repo_dir / ".github" / "skills" / "SKILL.md",
repo_dir / ".github" / "skills" / "quality-playbook" / "SKILL.md",
SCRIPT_DIR / ".." / "SKILL.md",
SCRIPT_DIR / "SKILL.md",
])
if not skill_version:
warn("Cannot detect skill version from SKILL.md")
return skill_version
progress_md = q / "PROGRESS.md"
if progress_md.is_file():
pv = read_skill_value_line(progress_md, "Skill version:")
if pv == skill_version:
pass_(f"PROGRESS.md version matches ({skill_version})")
elif pv:
fail(f"PROGRESS.md version '{pv}' != '{skill_version}'")
else:
warn("PROGRESS.md missing Skill version field")
json_path = _resolve_artifact_path(q, "results/tdd-results.json")
if json_path.is_file():
data = load_json(json_path)
tv = get_str(data, "skill_version")
if tv == skill_version:
pass_("tdd-results.json skill_version matches")
elif tv:
fail(f"tdd-results.json skill_version '{tv}' != '{skill_version}'")
return skill_version
def check_cross_run_contamination(repo_dir, q, version_arg, skill_version):
"""Cross-run contamination detection."""
print("[Cross-Run Contamination]")
repo_name = repo_dir.name
if skill_version and version_arg:
matches = re.findall(r"[0-9]+\.[0-9]+\.[0-9]+", repo_name)
dir_version = matches[-1] if matches else ""
if dir_version and dir_version != skill_version:
fail(f"Directory version '{dir_version}' != skill version '{skill_version}' — possible cross-run contamination")
else:
pass_("No version mismatch detected")
json_path = _resolve_artifact_path(q, "results/tdd-results.json")
if json_path.is_file() and skill_version:
data = load_json(json_path)
json_sv = get_str(data, "skill_version")
if json_sv and json_sv != skill_version:
fail(f"tdd-results.json skill_version '{json_sv}' != SKILL.md '{skill_version}' — stale artifacts from prior run?")
def _check_exploration_sections(path):
"""Check that EXPLORATION.md contains all required section titles."""
required_sections = [
"## Open Exploration Findings",
"## Quality Risks",
"## Pattern Applicability Matrix",
"## Candidate Bugs for Phase 2",
"## Gate Self-Check",
]
try:
content = path.read_text(encoding="utf-8", errors="replace")
except OSError as exc:
fail(f"EXPLORATION.md unreadable: {exc}")
return
for section in required_sections:
if section not in content:
fail(f"EXPLORATION.md missing required section: {section!r}")
def check_run_metadata(q):
"""Validate the run-metadata sidecar JSON (run-YYYY-MM-DDTHH-MM-SS.json)."""
print("[Run Metadata]")
results_dir = _resolve_artifact_path(q, "results")
pattern = str(results_dir / "run-*.json")
import glob as _glob
matches = _glob.glob(pattern)
if not matches:
fail("run-metadata JSON missing (expected quality/results/run-YYYY-MM-DDTHH-MM-SS.json)")
return
if len(matches) > 1:
warn(f"Multiple run-metadata files found: {len(matches)}")
filename_re = re.compile(r"run-\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}\.json$")
for path in matches:
if not filename_re.search(path):
fail(f"run-metadata filename does not match expected format: {path}")
data = load_json(Path(path))
if data is None:
fail(f"run-metadata JSON parse error: {path}")
continue
required_fields = ("schema_version", "skill_version", "project", "model", "runner", "start_time")
for field in required_fields:
if not data.get(field):
fail(f"run-metadata missing or empty field: {field!r}")
pass_("run-metadata JSON present")
# --- Per-repo entry point ---
# ---------------------------------------------------------------------------
# v1.5.1 Layer-1 mechanical invariants (schemas.md §10).
#
# Each check gracefully no-ops on pre-v1.5.1 runs (absent manifests = legacy
# repo; nothing to enforce). When the v1.5.1 artifacts are present every
# invariant below is enforced mechanically and FAILs with a specific
# <path>: <reason> message so the operator can fix the single artifact
# without re-running the whole playbook.
# ---------------------------------------------------------------------------
_V150_VALID_DISPOSITIONS = (
"code-fix",
"spec-fix",
"upstream-spec-issue",
"mis-read",
"deferred",
)
_V150_VALID_FIX_TYPES = ("code", "spec", "both")
_V150_ILLEGAL_FIX_PAIRS = {
("code-fix", "spec"),
("spec-fix", "code"),
("upstream-spec-issue", "code"),
("mis-read", "both"),
}
_V150_SUPPORTED_EXTENSIONS = (".txt", ".md")
# v1.5.4 Part 1 / Round 1 Council finding C2-1: INDEX schema is now
# version-routed. New runs MUST emit schema_version "2.0" with
# target_role_breakdown; legacy archives carry schema_version "1.0"
# (or no schema_version at all) with target_project_type. The fields
# common to both schemas live in _V150_INDEX_COMMON_FIELDS; the
# version-specific fields live in their own tuples and are picked at
# validation time.
#
# v1.5.4 Round 2 Council finding C1: SCHEMA_VERSION_CURRENT pins the
# version this gate understands. Future schemas (>2.0) refuse with an
# explicit error rather than silently downgrading to legacy. When a
# v1.5.5+ run bumps the schema, also bump this constant; otherwise the
# new gate version will reject the new INDEX shape on purpose.
SCHEMA_VERSION_CURRENT = "2.0"
_V150_INDEX_COMMON_FIELDS = (
"run_timestamp_start",
"run_timestamp_end",
"duration_seconds",
"qpb_version",
"target_repo_path",
"target_repo_git_sha",
"phases_executed",
"summary",
"artifacts",
)
_V150_INDEX_LEGACY_FIELDS = ("target_project_type",)
_V154_INDEX_CURRENT_FIELDS = ("target_role_breakdown",)
# Legacy alias: a small number of pre-iteration tests still import
# _V150_REQUIRED_INDEX_FIELDS expecting a single tuple. Preserve the
# alias under the v1.5.4-current contract; the version-routed
# enforcement happens inside check_v1_5_0_index_md.
_V150_REQUIRED_INDEX_FIELDS = (
_V150_INDEX_COMMON_FIELDS + _V154_INDEX_CURRENT_FIELDS
)
_V150_REQUIRED_SUMMARY_KEYS = ("requirements", "bugs", "gate_verdict")
# ---------------------------------------------------------------------------
# v1.5.3 — schema extensions (schemas.md §3.6–§3.10, §4.1, §6.1, §8.1, §10
# invariants #21#23). Field-presence detection (§3.10) toggles the
# v1.5.3 invariants on per-manifest, NOT a schema_version comparison.
# ---------------------------------------------------------------------------
_V153_VALID_SOURCE_TYPES = (
"code-derived",
"skill-section",
"reference-file",
"execution-observation",
# v1.5.6 (QG-fail-2 from the v1.5.6 self-bootstrap): REQs derived from
# operator-supplied informal documentation under the target repo's
# `reference_docs/` tree. Distinct from `reference-file`, which
# schemas.md §3.7 ties to QPB-shipped reference files under
# `references/`. The Phase 2 LLM disambiguates the two evidence
# sources by name; the schema and gate now match.
"docs-derived",
)
_V153_VALID_DIVERGENCE_TYPES = (
"code-spec",
"internal-prose",
"prose-to-code",
"execution",
)
_V153_VALID_FORMAL_DOC_ROLES = (
"external-spec",
"project-spec",
"skill-self-spec",
"skill-reference",
)
# DQ-3 (v1.5.3 Phase 3 / Round 2 Council): the v1.5.3 field-presence
# detection key set is module-level so a regression test can pin it
# against schemas.md's enum-bearing field list. A future schema
# addition (e.g., a fifth v1.5.3-only field) that updates ONLY this
# constant without updating the test's literal will fail the regression
# test, forcing lockstep maintenance and surfacing the change for
# explicit review.
_V153_FIELD_KEYS = frozenset({"source_type", "divergence_type", "role"})
def _is_v1_5_3_shaped(manifest):
"""Return True iff any record in *manifest* carries a v1.5.3 field.
Walks the records (or `reviews`) once. Presence of any key in
_V153_FIELD_KEYS on any record toggles strict-mode validation per
schemas.md §3.10. Empty / unparsable manifests return False so
legacy fixtures stay on the soft-warn path.
DQ-3 design note: the checked-key set is sourced from
_V153_FIELD_KEYS (a module-level frozenset) rather than hardcoded
in this function's body. A regression test in
test_quality_gate.py::TestV153FieldKeysContract pins
_V153_FIELD_KEYS against the literal `{"source_type",
"divergence_type", "role"}` so a future maintainer adding a
v1.5.3-only field to the schema cannot silently miss updating the
detection helper.
"""
if not isinstance(manifest, dict):
return False
records = manifest.get("records")
if not isinstance(records, list):
records = manifest.get("reviews") if isinstance(
manifest.get("reviews"), list
) else []
for rec in records:
if not isinstance(rec, dict):
continue
if not _V153_FIELD_KEYS.isdisjoint(rec.keys()):
return True
return False
def _v150_manifest(q, name):
"""Return the parsed top-level JSON object or None if absent/invalid."""
path = q / name
if not path.is_file():
return None
data = load_json(path)
if isinstance(data, dict):
return data
fail(f"{path.name}: not a valid JSON object (schemas.md §1.6)")
return None
def check_v1_5_0_cite_extensions(repo_dir):
"""§10 invariant #9 — reference_docs/cite/ contains only .txt/.md.
v1.5.2 collapsed the old formal_docs/+informal_docs/ split into a single
reference_docs/ tree with reference_docs/cite/ holding citable material.
The plaintext-only constraint now applies to that cite folder; the check
retains the v1.5.0 invariant ancestry (hence the _v1_5_0_ name prefix).
"""
folder = repo_dir / "reference_docs" / "cite"
if not folder.is_dir():
return
any_file = False
for path in sorted(folder.rglob("*")):
if not path.is_file():
continue
any_file = True
if path.name == "README.md":
continue
if path.name.endswith(".meta.json"):
continue
# v1.5.6 (QG-fail-1 from the v1.5.6 self-bootstrap): `.gitkeep`
# is the documented sentinel that pins `reference_docs/cite/`
# in version control even when adopters have no citable
# plaintext yet. The pre-flight expects it to exist; the gate
# must not reject it.
if path.name == ".gitkeep":
continue
ext = path.suffix.lower()
if ext not in _V150_SUPPORTED_EXTENSIONS:
rel = path.relative_to(repo_dir).as_posix()
fail(
f"{rel}: unsupported extension {ext or '(none)'} under reference_docs/cite/ "
"(schemas.md §2 allows only .txt, .md; §10 invariant #9)"
)
if any_file:
pass_("reference_docs/cite/: all files use supported extensions")
def check_v1_5_0_manifest_wrappers(q):
"""§10 invariant #13 — manifest wrapper shape.
Four record-shaped manifests (formal_docs / requirements / use_cases /
bugs) use `records`; citation_semantic_check.json uses `reviews`
(schemas.md §9.1). Every manifest must carry schema_version +
generated_at as non-empty strings.
"""
record_shaped = (
"formal_docs_manifest.json",
"requirements_manifest.json",
"use_cases_manifest.json",
"bugs_manifest.json",
)
for name in record_shaped:
data = _v150_manifest(q, name)
if data is None:
continue
for key in ("schema_version", "generated_at"):
if not isinstance(data.get(key), str) or not data[key]:
fail(f"{name}: missing or empty top-level {key!r} (schemas.md §1.6)")
if not isinstance(data.get("records"), list):
fail(f"{name}: missing or non-array top-level 'records' (schemas.md §1.6)")
if "reviews" in data:
fail(
f"{name}: has 'reviews' key — reserved for citation_semantic_check.json "
"per schemas.md §9.1 / §10 invariant #13"
)
else:
pass_(f"{name}: manifest wrapper valid")
data = _v150_manifest(q, "citation_semantic_check.json")
if data is not None:
for key in ("schema_version", "generated_at"):
if not isinstance(data.get(key), str) or not data[key]:
fail(
f"citation_semantic_check.json: missing or empty top-level {key!r} "
"(schemas.md §1.6)"
)
if not isinstance(data.get("reviews"), list):
fail(
"citation_semantic_check.json: missing or non-array top-level 'reviews' "
"(schemas.md §9.1 — semantic check uses 'reviews', not 'records')"
)
if "records" in data:
fail(
"citation_semantic_check.json: has 'records' key — semantic check uses "
"'reviews' per schemas.md §9.1 / §10 invariant #13"
)
else:
pass_("citation_semantic_check.json: manifest wrapper valid")
def _check_citation_block(repo_dir, req_id, citation, formal_docs_by_path, req_tier):
excerpt = citation.get("citation_excerpt")
if not isinstance(excerpt, str) or not excerpt:
fail(
"requirements_manifest.json",
f"record_id={req_id}: citation has empty or missing citation_excerpt "
"(schemas.md §10 invariant #4)",
)
return
doc_path_str = citation.get("document")
if not isinstance(doc_path_str, str) or not doc_path_str:
fail(
"requirements_manifest.json",
f"record_id={req_id}: citation missing 'document' field",
)
return
section = citation.get("section")
line = citation.get("line")
has_section = isinstance(section, str) and section.strip()
has_line = isinstance(line, int) and not isinstance(line, bool)
if not has_section and not has_line:
fail(
"requirements_manifest.json",
f"record_id={req_id}: citation has no section or line locator "
"(page alone is insufficient; schemas.md §10 invariant #4)",
)
return
fd_rec = formal_docs_by_path.get(doc_path_str)
if fd_rec is None:
fail(
"requirements_manifest.json",
f"record_id={req_id}: citation document {doc_path_str!r} "
"not in formal_docs_manifest.json (schemas.md §10 invariant #2)",
)
return
fd_tier = fd_rec.get("tier")
if fd_tier != req_tier:
fail(
"requirements_manifest.json",
f"record_id={req_id}: tier={req_tier} does not match cited FORMAL_DOC "
f"tier={fd_tier!r} (schemas.md §10 invariant #14)",
)
fd_sha = fd_rec.get("document_sha256")
cite_sha = citation.get("document_sha256")
if isinstance(fd_sha, str) and isinstance(cite_sha, str) and fd_sha != cite_sha:
fail(
"requirements_manifest.json",
f"record_id={req_id}: citation.document_sha256 does not match FORMAL_DOC "
"(schemas.md §10 invariant #3 — citation_stale)",
)
if _CITATION_VERIFIER is None:
warn(
f"requirements_manifest.json: record_id={req_id}: byte-equality skipped — "
"bin/citation_verifier unavailable on this install"
)
return
doc_path = repo_dir / doc_path_str
if not doc_path.is_file():
fail(
"requirements_manifest.json",
f"record_id={req_id}: citation document not on disk: {doc_path_str}",
)
return
try:
bytes_ = doc_path.read_bytes()
fresh = _CITATION_VERIFIER.extract_excerpt(
bytes_, doc_path.suffix.lower(), section if has_section else None,
line if has_line else None,
)
except _CITATION_VERIFIER.CitationResolutionError as exc:
fail(
"requirements_manifest.json",
f"record_id={req_id}: citation location does not resolve in "
f"{doc_path_str}: {exc.message} (schemas.md §10 invariant #4)",
)
return
except Exception as exc: # noqa: BLE001 — fail with a real message
fail(
"requirements_manifest.json",
f"record_id={req_id}: citation verifier errored: {exc}",
)
return
if fresh != excerpt:
fail(
"requirements_manifest.json",
f"record_id={req_id}: citation_excerpt is not byte-equal to fresh "
f"extraction from {doc_path_str} "
"(schemas.md §10 invariant #11 — Layer-1 anti-hallucination)",
)
def check_v1_5_0_requirements_manifest(repo_dir, q):
"""§10 invariants #1, #4, #8, #11, #14 — REQ shape, citation gating, functional_section."""
req_data = _v150_manifest(q, "requirements_manifest.json")
if req_data is None:
return
records = req_data.get("records")
if not isinstance(records, list):
return # wrapper check already reported
fd_data = _v150_manifest(q, "formal_docs_manifest.json")
formal_docs_by_path = {}
if fd_data and isinstance(fd_data.get("records"), list):
for rec in fd_data["records"]:
if isinstance(rec, dict) and isinstance(rec.get("source_path"), str):
formal_docs_by_path[rec["source_path"]] = rec
for idx, rec in enumerate(records):
if not isinstance(rec, dict):
fail(
"requirements_manifest.json",
f"record_id=<#{idx}>: not a JSON object",
)
continue
req_id = rec.get("id", f"<#{idx}>")
fs = rec.get("functional_section")
if not isinstance(fs, str) or not fs.strip():
fail(
"requirements_manifest.json",
f"record_id={req_id}: has empty or missing functional_section "
"(schemas.md §10 invariant #8)",
)
tier = rec.get("tier")
citation = rec.get("citation")
if tier in (1, 2):
if not isinstance(citation, dict):
fail(
"requirements_manifest.json",
f"record_id={req_id}: is tier {tier} but has no citation block "
"(schemas.md §10 invariant #1)",
)
continue
_check_citation_block(repo_dir, req_id, citation, formal_docs_by_path, tier)
elif tier in (3, 4, 5):
if citation is not None:
fail(
"requirements_manifest.json",
f"record_id={req_id}: is tier {tier} but carries a citation block "
"(citations are for Tier 1/2 only per schemas.md §10 invariant #1)",
)
elif tier is None:
fail(
"requirements_manifest.json",
f"record_id={req_id}: missing 'tier' field",
)
else:
fail(
"requirements_manifest.json",
f"record_id={req_id}: has invalid tier {tier!r} (expected integer 15)",
)
# v1.5.2: validate the optional `pattern` field on the REQ record.
pattern = rec.get("pattern")
if pattern is not None and pattern not in VALID_PATTERN_VALUES:
fail(
"requirements_manifest.json",
f"record_id={req_id}: has invalid pattern {pattern!r} "
f"(expected one of {sorted(VALID_PATTERN_VALUES)})",
)
pass_("requirements_manifest.json: v1.5.1 Layer-1 REQ checks complete")
def check_v1_5_0_bugs_manifest(q):
"""§10 invariants #7, #12 — disposition completeness + legal fix_type × disposition."""
data = _v150_manifest(q, "bugs_manifest.json")
if data is None:
return
records = data.get("records")
if not isinstance(records, list):
return
for idx, rec in enumerate(records):
if not isinstance(rec, dict):
continue
bug_id = rec.get("id", f"<#{idx}>")
disp = rec.get("disposition")
if disp not in _V150_VALID_DISPOSITIONS:
fail(
"bugs_manifest.json",
f"record_id={bug_id}: has invalid or missing disposition {disp!r} "
f"(schemas.md §10 invariant #7, valid: "
f"{', '.join(_V150_VALID_DISPOSITIONS)})",
)
continue
rationale = rec.get("disposition_rationale")
if not isinstance(rationale, str) or not rationale.strip():
fail(
"bugs_manifest.json",
f"record_id={bug_id}: has empty or missing disposition_rationale "
"(schemas.md §10 invariant #7)",
)
ft = rec.get("fix_type")
if ft not in _V150_VALID_FIX_TYPES:
fail(
"bugs_manifest.json",
f"record_id={bug_id}: has invalid or missing fix_type {ft!r}",
)
continue
if (disp, ft) in _V150_ILLEGAL_FIX_PAIRS:
fail(
"bugs_manifest.json",
f"record_id={bug_id}: illegal disposition × fix_type combination "
f"({disp}, {ft}) per schemas.md §3.4 / §10 invariant #12",
)
pass_("bugs_manifest.json: v1.5.1 Layer-1 BUG checks complete")
def check_v1_5_0_index_md(q):
"""§10 invariant #10 — quality/INDEX.md exists with all §11 required fields.
v1.5.4 Part 1 / Round 1 Council finding C2-1 + Round 2 Council
finding C1: routes by INDEX payload.schema_version with explicit
handling for each case so future schemas don't silently downgrade.
- ``schema_version == SCHEMA_VERSION_CURRENT`` (currently
``"2.0"``) → the v1.5.4 contract; target_role_breakdown
required (null is legitimate for the stub before Phase 1).
- ``schema_version == "1.0"`` → legacy v1.5.3 archive;
target_project_type required; one WARN emitted.
- ``schema_version`` absent/empty AND payload carries
target_project_type without target_role_breakdown → legacy
WARN (heuristic fallback for pre-schema-version archives).
- ``schema_version`` absent/empty AND payload doesn't match the
legacy heuristic → current path; the run is treated as a
v1.5.4 stub that simply hasn't populated schema_version yet,
and target_role_breakdown is required.
- any other ``schema_version`` (e.g. ``"3.0"`` from a future
gate) → explicit FAIL "newer than supported" so the operator
knows to upgrade the gate or downgrade the run.
This keeps historical archives under quality/previous_runs/
legible without rewriting them retroactively while keeping the
gate strict on current runs.
"""
path = q / "INDEX.md"
v150_artifacts = (
"formal_docs_manifest.json",
"requirements_manifest.json",
"use_cases_manifest.json",
"bugs_manifest.json",
"citation_semantic_check.json",
)
is_v150_run = any((q / name).is_file() for name in v150_artifacts)
if not path.is_file():
if is_v150_run:
fail(
"quality/INDEX.md does not exist (required on every v1.5.1 run per "
"schemas.md §10 invariant #10)"
)
return
text = path.read_text(encoding="utf-8", errors="ignore")
match = re.search(r"```json\n(.*?)\n```", text, re.DOTALL)
if not match:
fail("quality/INDEX.md: no fenced JSON block found (schemas.md §11)")
return
try:
payload = json.loads(match.group(1))
except json.JSONDecodeError as exc:
fail(f"quality/INDEX.md: fenced JSON block invalid: {exc}")
return
if not isinstance(payload, dict):
fail("quality/INDEX.md: fenced JSON block is not a JSON object")
return
# Schema-version routing for INDEX.md (v1.5.4 Round 2 Council
# finding C1). Four cases, handled explicitly so future schemas
# don't silently downgrade to legacy:
# 1. schema_version == "1.0" -> legacy WARN
# 2. schema_version absent/empty AND the payload -> legacy WARN
# carries target_project_type but not (heuristic
# target_role_breakdown fallback for
# pre-schema-
# version
# archives)
# 3. schema_version == SCHEMA_VERSION_CURRENT -> current path
# 4. schema_version absent/empty AND the payload
# doesn't fit case 2 -> current path
# (FAIL on
# missing
# target_role_breakdown
# because the
# run is
# ambiguous and
# v1.5.4 is the
# live shape)
# 5. any other schema_version -> explicit FAIL
# "newer than
# supported"
schema_version = payload.get("schema_version")
if schema_version == "1.0":
is_legacy = True
elif schema_version in (None, ""):
is_legacy = (
"target_project_type" in payload
and "target_role_breakdown" not in payload
)
elif schema_version == SCHEMA_VERSION_CURRENT:
is_legacy = False
else:
fail(
f"quality/INDEX.md: schema_version {schema_version!r} is "
f"newer than this gate supports (current: "
f"{SCHEMA_VERSION_CURRENT!r}). Upgrade the gate or "
"downgrade the run."
)
return
if is_legacy:
warn(
f"quality/INDEX.md: schema_version={schema_version!r} treated as "
"legacy v1.5.3 archive (target_project_type contract). v1.5.4+ "
f"runs MUST emit schema_version={SCHEMA_VERSION_CURRENT!r} with "
"target_role_breakdown."
)
required = _V150_INDEX_COMMON_FIELDS + _V150_INDEX_LEGACY_FIELDS
else:
required = _V150_INDEX_COMMON_FIELDS + _V154_INDEX_CURRENT_FIELDS
for key in required:
if key not in payload:
fail(f"quality/INDEX.md: missing required field {key!r} (schemas.md §11)")
continue
val = payload[key]
if isinstance(val, str) and not val:
fail(f"quality/INDEX.md: field {key!r} is empty string (schemas.md §11)")
summary = payload.get("summary")
if isinstance(summary, dict):
for sub in _V150_REQUIRED_SUMMARY_KEYS:
if sub not in summary:
fail(
f"quality/INDEX.md: summary missing {sub!r} sub-key "
"(schemas.md §11)"
)
pass_("quality/INDEX.md: §11 fields present")
_V150_VALID_VERDICTS = ("supports", "overreaches", "unclear")
def check_v1_5_0_semantic_check(q):
"""§10 invariant #17 — Council-of-Three majority-overreaches rule.
Layer-2 semantic check (Phase 6). Gate does NOT re-run the semantic
review; it parses quality/citation_semantic_check.json and applies
the majority-overreaches rule:
- ≥2 of 3 `overreaches` for the same Tier 1/2 REQ → FAIL.
- isolated 1/3 `overreaches` or `unclear` → WARN.
- <3 reviews for any Tier 1/2 REQ → FAIL (schemas.md §9.4).
- review entry for a Tier 3/4/5 REQ → FAIL (only Tier 1/2 are
semantically reviewable since they carry citations).
When requirements_manifest.json has zero Tier 1/2 REQs the
citation_semantic_check.json file is still expected (emitted with
empty reviews[]); its absence in that case warns rather than
fails to avoid breaking Spec Gap runs.
"""
req_data = _v150_manifest(q, "requirements_manifest.json")
tier_by_req = {}
if req_data and isinstance(req_data.get("records"), list):
for rec in req_data["records"]:
if isinstance(rec, dict):
rid = rec.get("id")
tier = rec.get("tier")
if isinstance(rid, str) and isinstance(tier, int) and not isinstance(tier, bool):
tier_by_req[rid] = tier
tier_12_req_ids = {rid for rid, t in tier_by_req.items() if t in (1, 2)}
sc_path = q / "citation_semantic_check.json"
if not sc_path.is_file():
if tier_12_req_ids:
fail(
"quality/citation_semantic_check.json",
"file missing (schemas.md §10 invariant #17 requires a semantic "
"check for every Tier 1/2 REQ)",
)
else:
# Spec Gap: no Tier 1/2 REQs to review. File is expected but its
# absence doesn't break the invariant since there's nothing to
# enforce. Warn so the orchestrator knows to emit the empty file.
warn(
"quality/citation_semantic_check.json: file missing; no Tier 1/2 "
"REQs present so invariant #17 has nothing to enforce — emit an "
"empty reviews[] for contract completeness"
)
return
data = _v150_manifest(q, "citation_semantic_check.json")
if data is None:
return # wrapper check already reported the failure
reviews = data.get("reviews")
if not isinstance(reviews, list):
return # wrapper check already reported
by_req = {}
seen_reviewers = {}
for idx, entry in enumerate(reviews):
if not isinstance(entry, dict):
fail(
"citation_semantic_check.json",
f"reviews[#{idx}]: not a JSON object",
)
continue
rid = entry.get("req_id")
reviewer = entry.get("reviewer")
verdict = entry.get("verdict")
notes = entry.get("notes")
if not isinstance(rid, str) or not rid:
fail(
"citation_semantic_check.json",
f"reviews[#{idx}]: missing or non-string req_id",
)
continue
if not isinstance(reviewer, str) or not reviewer:
fail(
"citation_semantic_check.json",
f"record_id={rid}: missing or non-string reviewer",
)
continue
if verdict not in _V150_VALID_VERDICTS:
fail(
"citation_semantic_check.json",
f"record_id={rid}: reviewer={reviewer!r} invalid verdict "
f"{verdict!r}; expected one of {_V150_VALID_VERDICTS}",
)
continue
if not isinstance(notes, str):
fail(
"citation_semantic_check.json",
f"record_id={rid}: reviewer={reviewer!r} notes must be a string",
)
continue
# §9.4 common-mistake: tier check — review entries must belong to
# Tier 1/2 REQs only.
tier = tier_by_req.get(rid)
if tier is None:
fail(
"citation_semantic_check.json",
f"record_id={rid}: reviewer={reviewer!r} reviews a REQ that does "
"not exist in requirements_manifest.json",
)
continue
if tier not in (1, 2):
fail(
"citation_semantic_check.json",
f"record_id={rid}: reviewer={reviewer!r} reviews a tier-{tier} "
"REQ; semantic check applies to Tier 1/2 only (schemas.md §9.4)",
)
continue
# Detect duplicate (req_id, reviewer) pairs — a typo that would slip a
# vote past the majority computation.
pair_key = seen_reviewers.setdefault(rid, set())
if reviewer in pair_key:
fail(
"citation_semantic_check.json",
f"record_id={rid}: duplicate review from reviewer={reviewer!r}",
)
continue
pair_key.add(reviewer)
by_req.setdefault(rid, []).append(entry)
# §9.4: every Tier 1/2 REQ needs at least 3 reviews.
for rid in sorted(tier_12_req_ids):
entries = by_req.get(rid, [])
if len(entries) < 3:
fail(
"citation_semantic_check.json",
f"record_id={rid}: fewer than 3 reviews ({len(entries)} present) "
"— schemas.md §9.4 requires one entry per council member for "
"every Tier 1/2 REQ",
)
continue
overreach_count = sum(1 for e in entries if e.get("verdict") == "overreaches")
unclear_count = sum(1 for e in entries if e.get("verdict") == "unclear")
if overreach_count >= 2:
reviewers_flagged = ", ".join(
sorted(
str(e.get("reviewer"))
for e in entries
if e.get("verdict") == "overreaches"
)
)
fail(
"citation_semantic_check.json",
f"record_id={rid}: semantic check majority overreaches "
f"({overreach_count}/{len(entries)} reviewers flagged: "
f"{reviewers_flagged}) — schemas.md §10 invariant #17",
)
elif overreach_count == 1:
flagged = next(
str(e.get("reviewer"))
for e in entries
if e.get("verdict") == "overreaches"
)
warn(
f"citation_semantic_check.json: record_id={rid}: 1/{len(entries)} "
f"reviewer ({flagged}) flagged as `overreaches` — surfaced for "
"human review; not a gate failure unless ≥2 agree"
)
if unclear_count >= 1 and overreach_count == 0:
flagged = ", ".join(
sorted(
str(e.get("reviewer"))
for e in entries
if e.get("verdict") == "unclear"
)
)
warn(
f"citation_semantic_check.json: record_id={rid}: "
f"{unclear_count}/{len(entries)} reviewer(s) flagged as "
f"`unclear` ({flagged}) — surfaced for human review"
)
if not tier_12_req_ids:
pass_(
"citation_semantic_check.json: no Tier 1/2 REQs to review "
"(invariant #17 vacuously satisfied)"
)
else:
pass_(
f"citation_semantic_check.json: §10 invariant #17 checks complete "
f"for {len(tier_12_req_ids)} Tier 1/2 REQ(s)"
)
# --- v1.5.1 Item 5.2: challenge-gate coverage invariant -------------------
# Canonical verdict-line regex from Impl-Plan Item 5.2. Matches a top-level
# "**Verdict:** CONFIRMED/DOWNGRADED/REJECTED" line as a stand-alone line.
_CHALLENGE_VERDICT_RE = re.compile(
r"^\*\*Verdict:\*\*\s+(CONFIRMED|DOWNGRADED|REJECTED)\s*$",
re.MULTILINE,
)
# Legacy final-verdict form used by challenge records generated before the
# canonical regex was specified (including the preserved virtio-1.4.6
# evidence at repos/benchmark-1.5.0/virtio-1.4.6/quality/challenge/).
# The briefing says "this invariant only verifies the challenge ran" — the
# legacy form unambiguously records a final verdict, so it satisfies the
# invariant's intent without requiring operators to regenerate baseline
# artifacts. New v1.5.1+ runs should prefer the canonical form.
_CHALLENGE_VERDICT_LEGACY_RE = re.compile(
r"^\*\*(CONFIRMED|DOWNGRADED|REJECTED)\.?\*\*",
re.MULTILINE,
)
# Trigger-pattern keyword tables (case-insensitive substring matching).
_CHALLENGE_SECURITY_SEVERITIES = frozenset({"CRITICAL", "HIGH"})
_CHALLENGE_SECURITY_KEYWORDS = (
"credential", "secret", "auth", "injection", "xss", "csrf",
"ssrf", "privilege", "bypass", "leak",
)
_CHALLENGE_SIBLING_KEYWORDS = (
"sibling", "parallel", "parity", "contrasted with", "same concern",
"in contrast", "other path", "other branch",
)
_CHALLENGE_MISSING_KEYWORDS = (
"never", "does not", "doesn't", "missing", "absent", "fails to",
)
_CHALLENGE_DESIGN_KEYWORDS = (
"todo", "why", "ooda", "design decision",
)
_CHALLENGE_ITERATION_KEYWORDS = (
"gap", "unfiltered", "parity", "adversarial", "iteration",
)
def _bug_writeup_text(q, bug_id):
"""Return lowercased writeup text for ``bug_id`` (empty string if absent).
Writeups live at quality/writeups/BUG-NNN.md. Reading failures are
treated as empty text — the invariant still runs on the manifest fields
(title / summary / source) which are present independently.
"""
path = _resolve_artifact_path(q, f"writeups/{bug_id}.md")
if not path.is_file():
return ""
try:
return path.read_text(encoding="utf-8", errors="ignore").lower()
except OSError:
return ""
def _bug_req_has_tier_12_citation(req_id, requirements_records):
"""True iff req_id resolves to a REQ with a non-empty citation and
tier in {1, 2}. Used by the "No spec basis" trigger pattern."""
if not req_id or not isinstance(requirements_records, list):
return False
for rec in requirements_records:
if not isinstance(rec, dict):
continue
if rec.get("id") != req_id:
continue
if rec.get("tier") not in (1, 2):
return False
citation = rec.get("citation")
if isinstance(citation, dict) and citation:
return True
return False
return False
def _contains_any(text, keywords):
"""Case-insensitive substring OR across a keyword tuple."""
if not text:
return False
lowered = text.lower()
return any(kw in lowered for kw in keywords)
def _classify_bug_triggers(rec, q, requirements_records):
"""Return the list of trigger-pattern names that fired for one bug.
Empty list means the bug does not require a challenge record.
Patterns mirror Impl-Plan Item 5.2 verbatim. Input aliasing:
- title: prefers rec['title'], falls back to rec['summary'].
- requirement: prefers rec['requirement'], falls back to rec['req_id']
(v1.4.x uses req_id; v1.5.1+ converges on requirement).
- source_comments: optional, older runs may omit it.
- source / discovery_phase: substring-matched against the
iteration-derived keyword list.
"""
fired = []
bug_id = rec.get("id", "")
title = rec.get("title") or rec.get("summary") or ""
severity = (rec.get("severity") or "").upper()
writeup = _bug_writeup_text(q, bug_id) if bug_id else ""
title_plus_writeup = f"{title}\n{writeup}"
# 1. Security-class.
if severity in _CHALLENGE_SECURITY_SEVERITIES and _contains_any(
title_plus_writeup, _CHALLENGE_SECURITY_KEYWORDS
):
fired.append("security-class")
# 2. No spec basis.
requirement = rec.get("requirement") or rec.get("req_id")
has_valid_citation = _bug_req_has_tier_12_citation(requirement, requirements_records)
if not requirement or not has_valid_citation:
fired.append("no-spec-basis")
# 3. Sibling-path divergence.
if _contains_any(writeup, _CHALLENGE_SIBLING_KEYWORDS):
fired.append("sibling-path-divergence")
# 4. Missing functionality.
if _contains_any(writeup, _CHALLENGE_MISSING_KEYWORDS):
fired.append("missing-functionality")
# 5. Design-decision comment (optional field).
source_comments = rec.get("source_comments")
if isinstance(source_comments, str) and _contains_any(
source_comments, _CHALLENGE_DESIGN_KEYWORDS
):
fired.append("design-decision-comment")
# 6. Iteration-derived.
source = rec.get("source") or ""
discovery_phase = rec.get("discovery_phase") or ""
iter_haystack = f"{source}\n{discovery_phase}"
if _contains_any(iter_haystack, _CHALLENGE_ITERATION_KEYWORDS):
fired.append("iteration-derived")
return fired
def _challenge_record_has_verdict(path):
"""True iff the file exists and contains either the canonical or
legacy verdict line per the invariant's accept set."""
if not path.is_file():
return False
try:
text = path.read_text(encoding="utf-8", errors="ignore")
except OSError:
return False
if _CHALLENGE_VERDICT_RE.search(text):
return True
if _CHALLENGE_VERDICT_LEGACY_RE.search(text):
return True
return False
def check_challenge_gate_coverage(q):
"""v1.5.1 Item 5.2 — every bug whose fingerprints trigger the challenge
gate must have a quality/challenge/BUG-NNN-challenge.md with a valid
verdict line.
N/A when quality/bugs_manifest.json is absent (zero-bug runs can't
have un-challenged bugs). Runs on the current quality/ tree only;
no cross-run state.
"""
data = _v150_manifest(q, "bugs_manifest.json")
if data is None:
# N/A — the plan explicitly says "invariant is N/A if the file is
# absent". Consistent with other quality_gate invariants that silently
# skip when their input isn't present.
return
records = data.get("records")
if not isinstance(records, list):
return
reqs_data = _v150_manifest(q, "requirements_manifest.json") or {}
req_records = reqs_data.get("records") if isinstance(reqs_data, dict) else None
challenge_dir = q / "challenge"
triggered = 0
missing = [] # list of (bug_id, [pattern names]) for bugs with no record
bad_verdict = [] # list of (bug_id, [pattern names]) for record w/o verdict
for rec in records:
if not isinstance(rec, dict):
continue
bug_id = rec.get("id")
if not bug_id:
continue
fired = _classify_bug_triggers(rec, q, req_records)
if not fired:
continue
triggered += 1
record_path = challenge_dir / f"{bug_id}-challenge.md"
if not record_path.is_file():
missing.append((bug_id, fired))
elif not _challenge_record_has_verdict(record_path):
bad_verdict.append((bug_id, fired))
if missing:
for bug_id, fired in missing:
fail(
"quality/challenge/",
f"{bug_id}: challenge record missing (triggered by: {', '.join(fired)}) "
f"— expected {bug_id}-challenge.md with a **Verdict:** line",
)
if bad_verdict:
for bug_id, fired in bad_verdict:
fail(
f"quality/challenge/{bug_id}-challenge.md",
f"missing or malformed verdict line (triggered by: {', '.join(fired)}) "
"— expected a line matching `^\\*\\*Verdict:\\*\\*\\s+(CONFIRMED|DOWNGRADED|REJECTED)` "
"or the legacy final-verdict form",
)
if triggered == 0:
pass_("challenge gate coverage: no bug triggered the challenge gate (vacuous)")
elif not missing and not bad_verdict:
pass_(
f"challenge gate coverage: {triggered} triggered bug(s) all have valid "
"challenge records"
)
def check_v1_5_3_formal_doc_role_validation(q):
"""schemas.md §10 invariant #23 — FORMAL_DOC.role on v1.5.3-shaped manifests.
Legacy manifest (no v1.5.3 fields anywhere): one WARN, then skip.
v1.5.3-shaped: every record MUST have role populated with a member of
formal_doc_role (§3.6).
"""
data = _v150_manifest(q, "formal_docs_manifest.json")
if data is None:
return
records = data.get("records")
if not isinstance(records, list):
return # wrapper check already reported
if not _is_v1_5_3_shaped(data):
warn(
"formal_docs_manifest.json: legacy manifest detected; treating absent "
"FORMAL_DOC.role as 'external-spec' per schemas.md §3.10 backward-compat rule"
)
return
any_fail = False
for idx, rec in enumerate(records):
if not isinstance(rec, dict):
continue
rec_id = rec.get("source_path", f"<#{idx}>")
role = rec.get("role")
if role not in _V153_VALID_FORMAL_DOC_ROLES:
fail(
"formal_docs_manifest.json",
f"record_id={rec_id}: missing or invalid role {role!r} on "
f"v1.5.3-shaped manifest (schemas.md §10 invariant #23, valid: "
f"{', '.join(_V153_VALID_FORMAL_DOC_ROLES)})",
)
any_fail = True
if not any_fail:
pass_("formal_docs_manifest.json: v1.5.3 role validation complete")
def check_v1_5_3_source_type_validation(q):
"""schemas.md §10 invariants #21 (first part) — REQ.source_type presence.
Legacy manifest: one WARN, then skip.
v1.5.3-shaped: every REQ MUST have source_type populated with a member
of req_source_type (§3.7).
"""
data = _v150_manifest(q, "requirements_manifest.json")
if data is None:
return
records = data.get("records")
if not isinstance(records, list):
return
if not _is_v1_5_3_shaped(data):
warn(
"requirements_manifest.json: legacy manifest detected; treating absent "
"REQ.source_type as 'code-derived' per schemas.md §3.10 backward-compat rule"
)
return
any_fail = False
for idx, rec in enumerate(records):
if not isinstance(rec, dict):
continue
req_id = rec.get("id", f"<#{idx}>")
source_type = rec.get("source_type")
if source_type not in _V153_VALID_SOURCE_TYPES:
fail(
"requirements_manifest.json",
f"record_id={req_id}: missing or invalid source_type "
f"{source_type!r} on v1.5.3-shaped manifest "
f"(schemas.md §10 invariant #21, valid: "
f"{', '.join(_V153_VALID_SOURCE_TYPES)})",
)
any_fail = True
if not any_fail:
pass_("requirements_manifest.json: v1.5.3 source_type validation complete")
def check_v1_5_3_skill_section_consistency(q):
"""schemas.md §10 invariant #21 (second part) — skill_section consistency.
On a v1.5.3-shaped requirements manifest, REQs with
source_type == 'skill-section' MUST have non-empty skill_section;
REQs with any other source_type value MUST have skill_section absent
or null (per §1.5: optional fields may be omitted or present as null).
Populated skill_section paired with non-skill-section source_type FAILs.
Legacy manifests are skipped silently here -- the source_type check
already emitted the single WARN for the manifest.
Deliberate piggyback (Round 2 Council, item 1): this is the one
documented exception to the "exactly one WARN per check function"
convention used by the other three v1.5.3 invariants. Both
check_v1_5_3_source_type_validation and this check share
requirements_manifest.json, so emitting a second WARN here would
double-warn for the same legacy file. The piggyback is locked in
by test_legacy_manifest_silently_skips in
TestV153SkillSectionConsistency -- a future maintainer reading the
brief and adding a WARN for consistency would break that test.
"""
data = _v150_manifest(q, "requirements_manifest.json")
if data is None:
return
records = data.get("records")
if not isinstance(records, list):
return
if not _is_v1_5_3_shaped(data):
return # source_type check handled the soft warn for this manifest
any_fail = False
for idx, rec in enumerate(records):
if not isinstance(rec, dict):
continue
req_id = rec.get("id", f"<#{idx}>")
source_type = rec.get("source_type")
skill_section = rec.get("skill_section")
if source_type == "skill-section":
if not isinstance(skill_section, str) or not skill_section.strip():
fail(
"requirements_manifest.json",
f"record_id={req_id}: source_type='skill-section' but "
f"skill_section is empty or missing "
"(schemas.md §10 invariant #21)",
)
any_fail = True
else:
if skill_section is not None and skill_section != "":
fail(
"requirements_manifest.json",
f"record_id={req_id}: skill_section={skill_section!r} "
f"populated but source_type={source_type!r} is not "
"'skill-section' (schemas.md §10 invariant #21)",
)
any_fail = True
if not any_fail:
pass_("requirements_manifest.json: v1.5.3 skill_section consistency complete")
def check_v1_5_3_divergence_type_validation(q):
"""schemas.md §10 invariant #22 — BUG.divergence_type on v1.5.3-shaped manifests.
Legacy manifest: one WARN, then skip.
v1.5.3-shaped: every BUG MUST have divergence_type populated with a
member of bug_divergence_type (§3.8).
"""
data = _v150_manifest(q, "bugs_manifest.json")
if data is None:
return
records = data.get("records")
if not isinstance(records, list):
return
if not _is_v1_5_3_shaped(data):
warn(
"bugs_manifest.json: legacy manifest detected; treating absent "
"BUG.divergence_type as 'code-spec' per schemas.md §3.10 backward-compat rule"
)
return
any_fail = False
for idx, rec in enumerate(records):
if not isinstance(rec, dict):
continue
bug_id = rec.get("id", f"<#{idx}>")
divergence_type = rec.get("divergence_type")
if divergence_type not in _V153_VALID_DIVERGENCE_TYPES:
fail(
"bugs_manifest.json",
f"record_id={bug_id}: missing or invalid divergence_type "
f"{divergence_type!r} on v1.5.3-shaped manifest "
f"(schemas.md §10 invariant #22, valid: "
f"{', '.join(_V153_VALID_DIVERGENCE_TYPES)})",
)
any_fail = True
if not any_fail:
pass_("bugs_manifest.json: v1.5.3 divergence_type validation complete")
_V153_COUNCIL_INBOX_ITEM_TYPES = frozenset({
"rejected-draft",
"tier-5-demotion",
"zero-req-section",
"weak-rationale",
})
def check_v1_5_3_council_inbox_validation(q):
"""Phase 3b BLOCK-4 cross-reference + DQ-5 structural validation.
Validates quality/phase3/pass_d_council_inbox.json against the
DQ-5 schema AND verifies that every Pass D rejection / Tier-5
demotion has a matching council-inbox item. Without the
cross-reference invariant, a syntactically-valid but functionally
-empty inbox could pass while pass_d_audit.json shows 30+
rejections -- the inbox population could silently break and the
gate would not catch it.
Two failure modes:
1. Structural -- malformed item record, invalid item_type,
missing required field per the DQ-5 schema.
2. Cross-reference -- pass_d_audit.json entry with outcome in
{rejected, demoted_to_tier_5} has no matching item in the
inbox.
Phase 3 artifact set is at <repo>/quality/phase3/, NOT at the
top-level <repo>/quality/. The check returns silently if the
phase3 directory does not exist (the project is Code-only or
Phase 3 has not been run yet).
"""
phase3_dir = _resolve_artifact_path(q, "phase3")
if not phase3_dir.is_dir():
return # phase 3 not run; not in scope for this manifest set
inbox_path = phase3_dir / "pass_d_council_inbox.json"
audit_path = phase3_dir / "pass_d_audit.json"
if not inbox_path.is_file():
return # phase 3 partially run; skip silently
inbox_data = load_json(inbox_path)
if not isinstance(inbox_data, dict):
fail(f"{inbox_path.name}: not a valid JSON object")
return
# Structural validation.
schema_version = inbox_data.get("schema_version")
if schema_version != "1.0":
fail(
f"{inbox_path.name}: schema_version {schema_version!r} "
"does not match the DQ-5 spec value '1.0'"
)
items = inbox_data.get("items")
if not isinstance(items, list):
fail(f"{inbox_path.name}: 'items' is missing or not a list")
return
required_fields = {
"item_type",
"draft_idx",
"section_idx",
"section_heading",
"rationale",
"context_excerpt",
"provisional_disposition",
}
for idx, item in enumerate(items):
if not isinstance(item, dict):
fail(f"{inbox_path.name}: item #{idx} is not a JSON object")
continue
missing = required_fields - set(item.keys())
if missing:
fail(
f"{inbox_path.name}: item #{idx} is missing required "
f"DQ-5 fields: {sorted(missing)}"
)
if item.get("item_type") not in _V153_COUNCIL_INBOX_ITEM_TYPES:
fail(
f"{inbox_path.name}: item #{idx} has invalid item_type "
f"{item.get('item_type')!r} (valid: "
f"{sorted(_V153_COUNCIL_INBOX_ITEM_TYPES)})"
)
rationale = item.get("rationale")
if not isinstance(rationale, str) or not rationale.strip():
fail(
f"{inbox_path.name}: item #{idx} has empty or missing "
"rationale"
)
# Cross-reference invariant: every rejected / demoted audit entry
# must have a matching inbox item by (draft_idx, item_type).
if audit_path.is_file():
audit_data = load_json(audit_path)
if isinstance(audit_data, dict):
inbox_pairs = {
(item.get("draft_idx"), item.get("item_type"))
for item in items
if isinstance(item, dict)
}
for entry in audit_data.get("rejected", []) or []:
if not isinstance(entry, dict):
continue
pair = (entry.get("draft_idx"), "rejected-draft")
if pair not in inbox_pairs:
fail(
f"{inbox_path.name}: pass_d_audit.json shows "
f"rejected draft_idx={entry.get('draft_idx')} "
"but there is no matching rejected-draft item "
"in the council inbox (BLOCK-4 cross-reference "
"invariant violation)"
)
for entry in audit_data.get("demoted_to_tier_5", []) or []:
if not isinstance(entry, dict):
continue
pair = (entry.get("draft_idx"), "tier-5-demotion")
if pair not in inbox_pairs:
fail(
f"{inbox_path.name}: pass_d_audit.json shows "
f"tier-5 demotion at draft_idx={entry.get('draft_idx')} "
"but there is no matching tier-5-demotion item "
"in the council inbox"
)
pass_(f"{inbox_path.name}: v1.5.3 council inbox validation complete")
# ---------------------------------------------------------------------------
# Phase 4 skill-project gate enforcement checks (DQ-4-4).
#
# These four checks fire when the target's role map shows skill-prose
# surface; they SKIP (informational `INFO: skipped` line, no fail
# counter increment) on pure-code targets. The check that always runs
# is check_role_map_consistency.
#
# v1.5.4 Part 1: the legacy Code/Skill/Hybrid string is now derived
# from the Phase-1 role map at <q>/exploration_role_map.json. The
# mapping mirrors bin/role_map.py::derive_legacy_project_type. If the
# role map is absent, all four checks SKIP silently — Phase 1 has not
# been run yet on this target. The gate ships into target repos as a
# stdlib-only script and cannot import bin/role_map; the small amount
# of role-map awareness it needs is inlined below.
# ---------------------------------------------------------------------------
def _load_role_map(q):
"""Return the parsed exploration_role_map.json dict, or None when
absent / unparsable. v1.5.4 inline replacement for the prior
project_type.json reader."""
return load_json(q / "exploration_role_map.json")
def _role_map_has_role(role_map, role_set):
if not isinstance(role_map, dict):
return False
files = role_map.get("files") or []
if not isinstance(files, list):
return False
for entry in files:
if isinstance(entry, dict) and entry.get("role") in role_set:
return True
return False
def _phase4_project_type(q):
"""Return the v1.5.3-equivalent classification string ('Code' /
'Skill' / 'Hybrid') derived from the Phase-1 role map, or None
when the role map is absent / unparsable.
Mapping (mirrors bin/role_map.derive_legacy_project_type):
- has skill-prose AND has code -> 'Hybrid'
- has skill-prose, no code -> 'Skill'
- no skill-prose -> 'Code'
"""
role_map = _load_role_map(q)
if role_map is None:
return None
skill = _role_map_has_role(role_map, ("skill-prose", "skill-reference"))
code = _role_map_has_role(role_map, ("code",))
if skill and code:
return "Hybrid"
if skill:
return "Skill"
return "Code"
def check_skill_section_req_coverage(repo_dir, q):
"""Skill / Hybrid: every operational SKILL.md section per
pass_d_section_coverage.json has ≥1 promoted REQ. Meta-allowlist
sections are exempt (their section_kind == 'meta').
SKIPS for Code projects."""
print("[Phase 4: skill-section REQ coverage]")
classification = _phase4_project_type(q)
if classification not in ("Skill", "Hybrid"):
info(f"check_skill_section_req_coverage: skip (project_type={classification!r})")
return
coverage_path = _resolve_artifact_path(q, "phase3/pass_d_section_coverage.json")
data = load_json(coverage_path)
if not isinstance(data, dict):
info(
"check_skill_section_req_coverage: skip "
"(pass_d_section_coverage.json missing or unparsable)"
)
return
failures = 0
for s in data.get("sections", []) or []:
if not isinstance(s, dict):
continue
kind = s.get("section_kind")
if kind != "operational":
continue
promoted = s.get("drafts_promoted", 0) or 0
if promoted < 1:
heading = s.get("heading") or "<unknown>"
document = s.get("document") or "SKILL.md"
section_idx = s.get("section_idx")
fail(
f"{document}",
f"section #{section_idx} {heading!r} has 0 promoted "
"REQs and is not in the meta allowlist "
"(check_skill_section_req_coverage)",
)
failures += 1
if failures == 0:
pass_("check_skill_section_req_coverage: every operational section has ≥1 promoted REQ")
def check_reference_file_req_coverage(repo_dir, q):
"""Skill / Hybrid: every reference file under references/ has ≥1
REQ citing it OR a `<!-- non-normative -->` marker in its first
5 lines.
SKIPS for Code projects."""
print("[Phase 4: reference-file REQ coverage]")
classification = _phase4_project_type(q)
if classification not in ("Skill", "Hybrid"):
info(f"check_reference_file_req_coverage: skip (project_type={classification!r})")
return
references_dir = repo_dir / "references"
if not references_dir.is_dir():
info("check_reference_file_req_coverage: skip (no references/ directory)")
return
formal_path = _resolve_artifact_path(q, "phase3/pass_c_formal.jsonl")
if not formal_path.is_file():
info(
"check_reference_file_req_coverage: skip "
"(pass_c_formal.jsonl missing — Phase 3 not run yet)"
)
return
cited_documents = set()
for line in formal_path.read_text(encoding="utf-8").splitlines():
if not line.strip():
continue
try:
rec = json.loads(line)
except json.JSONDecodeError:
continue
if not isinstance(rec, dict):
continue
sd = rec.get("source_document")
if isinstance(sd, str):
cited_documents.add(sd)
failures = 0
for ref in sorted(references_dir.glob("*.md")):
rel = f"references/{ref.name}"
if rel in cited_documents:
continue
# Non-normative marker check (first 5 lines).
head = ref.read_text(encoding="utf-8", errors="replace").splitlines()[:5]
if any("<!-- non-normative -->" in line.lower() for line in head):
continue
fail(
rel,
"no REQ cites this reference file and no <!-- non-normative --> "
"marker in its first 5 lines (check_reference_file_req_coverage)",
)
failures += 1
if failures == 0:
pass_("check_reference_file_req_coverage: every reference file has ≥1 citing REQ or non-normative marker")
def check_hybrid_cross_cutting_reqs(repo_dir, q):
"""Hybrid only: ≥1 REQ has triangulated evidence —
`source_type=skill-section` AND its acceptance_criteria references
a code artifact mentioned in another REQ with
`source_type=code-derived`.
SKIPS for Skill or Code projects."""
print("[Phase 4: hybrid cross-cutting REQs]")
classification = _phase4_project_type(q)
if classification != "Hybrid":
info(f"check_hybrid_cross_cutting_reqs: skip (project_type={classification!r})")
return
formal_path = _resolve_artifact_path(q, "phase3/pass_c_formal.jsonl")
if not formal_path.is_file():
info(
"check_hybrid_cross_cutting_reqs: skip "
"(pass_c_formal.jsonl missing — Phase 3 not run yet)"
)
return
skill_section_reqs = []
code_derived_artifacts = set()
for line in formal_path.read_text(encoding="utf-8").splitlines():
if not line.strip():
continue
try:
rec = json.loads(line)
except json.JSONDecodeError:
continue
if not isinstance(rec, dict):
continue
st = rec.get("source_type")
if st == "skill-section":
skill_section_reqs.append(rec)
elif st == "code-derived":
ac = (rec.get("acceptance_criteria") or "")
cite = (rec.get("citation_excerpt") or "")
for token in re.findall(
r"\b([\w./-]+\.(?:py|sh|json))\b", ac + " " + cite
):
code_derived_artifacts.add(token)
if not code_derived_artifacts:
# On a Hybrid project that hasn't yet produced any code-derived
# REQs, the cross-cutting check has nothing to triangulate
# against. INFO + skip rather than fail (the absence is the
# diagnostic).
info(
"check_hybrid_cross_cutting_reqs: skip "
"(no code-derived REQs in pass_c_formal.jsonl yet)"
)
return
triangulated = 0
for rec in skill_section_reqs:
ac = (rec.get("acceptance_criteria") or "") + " " + (
rec.get("citation_excerpt") or ""
)
if any(art in ac for art in code_derived_artifacts):
triangulated += 1
if triangulated >= 1:
break
if triangulated >= 1:
pass_(
f"check_hybrid_cross_cutting_reqs: triangulated evidence "
f"present (≥{triangulated} skill-section REQ references a "
"code-derived artifact)"
)
else:
fail(
"pass_c_formal.jsonl",
"Hybrid project has no triangulated REQ pair "
"(skill-section REQ referencing a code-derived artifact); "
"check_hybrid_cross_cutting_reqs",
)
def check_role_map_consistency(repo_dir, q):
"""All projects: exploration_role_map.json (when present) parses as
a JSON object, declares schema_version '1.0', carries a 'files'
list and a 'breakdown.percentages' dict with the four expected
share keys.
SKIPS silently when the role map is absent — Phase 1 has not been
run yet on this target. v1.5.4 Part 1 replacement for the v1.5.3
check_project_type_consistency, which keyed on
quality/project_type.json (now retired)."""
print("[Phase 4: role-map consistency]")
rm_path = q / "exploration_role_map.json"
if not rm_path.is_file():
info(
"check_role_map_consistency: skip "
"(exploration_role_map.json absent — Phase 1 not run yet)"
)
return
data = load_json(rm_path)
if not isinstance(data, dict):
fail(
f"{rm_path.relative_to(q.parent)}",
"exploration_role_map.json is not a valid JSON object",
)
return
if data.get("schema_version") != "1.0":
fail(
f"{rm_path.relative_to(q.parent)}",
f"schema_version {data.get('schema_version')!r} is not '1.0' "
"(check_role_map_consistency)",
)
return
files = data.get("files")
if not isinstance(files, list):
fail(
f"{rm_path.relative_to(q.parent)}",
"'files' is not a list (check_role_map_consistency)",
)
return
breakdown = data.get("breakdown")
if not isinstance(breakdown, dict):
fail(
f"{rm_path.relative_to(q.parent)}",
"'breakdown' is not an object (check_role_map_consistency)",
)
return
percentages = breakdown.get("percentages")
if not isinstance(percentages, dict):
fail(
f"{rm_path.relative_to(q.parent)}",
"'breakdown.percentages' is not an object "
"(check_role_map_consistency)",
)
return
missing = [
k for k in ("skill_share", "code_share", "tool_share", "other_share")
if k not in percentages
]
if missing:
fail(
f"{rm_path.relative_to(q.parent)}",
f"breakdown.percentages missing keys: {missing} "
"(check_role_map_consistency)",
)
return
derived = _phase4_project_type(q) or "Unknown"
pass_(
f"{rm_path.relative_to(q.parent)}: role map well-formed "
f"(legacy-derived project type {derived!r}; "
"check_role_map_consistency)"
)
def check_v1_5_2_cardinality_gate(repo_dir):
"""v1.5.2 Lever 3: Phase 5 cardinality reconciliation gate.
Surfaces every failure from validate_cardinality_gate() as a fail() entry.
"""
failures = validate_cardinality_gate(repo_dir)
if not failures:
pass_("compensation_grid.json: v1.5.2 cardinality gate clean")
return
for msg in failures:
fail("compensation_grid.json", msg)
def check_v1_5_0_gate_invariants(repo_dir, q):
"""Dispatcher that runs every Layer-1 mechanical check from schemas.md §10."""
check_v1_5_0_cite_extensions(repo_dir)
check_v1_5_0_manifest_wrappers(q)
check_v1_5_0_requirements_manifest(repo_dir, q)
check_v1_5_0_bugs_manifest(q)
check_v1_5_0_index_md(q)
# Phase 6 invariant #17 runs after requirements_manifest so it sees
# shape-validated REQ records.
check_v1_5_0_semantic_check(q)
# v1.5.1 Item 5.2: challenge-gate coverage runs last. It depends on
# requirements_manifest.json for the "No spec basis" pattern but
# does not redo schema checks that the prior invariants already cover.
check_challenge_gate_coverage(q)
# v1.5.2 Lever 3: cardinality reconciliation gate.
check_v1_5_2_cardinality_gate(repo_dir)
# v1.5.3 Phase 2: schema extensions for skill-aware projects (Code projects
# with legacy manifests hit the soft-warn path; v1.5.3-shaped manifests
# validate strictly per schemas.md §10 invariants #21#23).
check_v1_5_3_formal_doc_role_validation(q)
check_v1_5_3_source_type_validation(q)
check_v1_5_3_skill_section_consistency(q)
check_v1_5_3_divergence_type_validation(q)
# v1.5.3 Phase 3b: council inbox structural + cross-reference
# validation (DQ-5 + BLOCK-4). No-op for Code projects (phase3
# directory is absent).
check_v1_5_3_council_inbox_validation(q)
# v1.5.3 Phase 4 (DQ-4-4): skill-project gate enforcement. The
# first three SKIP for code-only projects (no skill-prose surface
# in the role map); check_role_map_consistency runs for all
# projects. v1.5.4 Part 1: project_type derived from the Phase-1
# role map instead of the retired project_type.json.
check_skill_section_req_coverage(repo_dir, q)
check_reference_file_req_coverage(repo_dir, q)
check_hybrid_cross_cutting_reqs(repo_dir, q)
check_role_map_consistency(repo_dir, q)
def check_repo(repo_dir, version_arg, strictness):
"""Run all checks for one repo. Writes output via pass_/fail_/warn/info."""
repo_dir = Path(repo_dir)
if str(repo_dir) == ".":
repo_dir = Path.cwd()
repo_name = repo_dir.name
q = repo_dir / "quality"
print("")
print(f"=== {repo_name} ===")
check_file_existence(repo_dir, q, strictness)
bug_count, bug_ids = check_bugs_heading(q)
tdd_data = check_tdd_sidecar(q, bug_count)
check_tdd_logs(q, bug_count, bug_ids, tdd_data)
check_integration_sidecar(q, strictness)
check_recheck_sidecar(q)
check_use_cases(repo_dir, q, strictness)
check_test_file_extension(repo_dir, q)
check_terminal_gate(q)
check_mechanical(q)
check_patches(q, bug_count, bug_ids, strictness)
check_writeups(q, bug_count)
skill_version = check_version_stamps(repo_dir, q)
check_cross_run_contamination(repo_dir, q, version_arg, skill_version)
check_run_metadata(q)
check_v1_5_0_gate_invariants(repo_dir, q)
print("")
# --- Main ---
def main(argv=None):
_reset_counters()
if argv is None:
argv = sys.argv[1:]
repo_dirs = []
version = ""
check_all = False
strictness = "benchmark"
expect_version = False
for arg in argv:
if expect_version:
version = arg
expect_version = False
continue
if arg == "--version":
expect_version = True
elif arg == "--all":
check_all = True
elif arg == "--benchmark":
strictness = "benchmark"
elif arg == "--general":
strictness = "general"
else:
repo_dirs.append(arg)
if not version:
version = detect_skill_version([
SCRIPT_DIR / ".." / "SKILL.md",
SCRIPT_DIR / "SKILL.md",
Path("SKILL.md"),
Path(".claude") / "skills" / "quality-playbook" / "SKILL.md",
Path(".github") / "skills" / "SKILL.md",
Path(".github") / "skills" / "quality-playbook" / "SKILL.md",
])
# Resolve repos
if check_all:
for entry in sorted(SCRIPT_DIR.glob(f"*-{version}")):
if (entry / "quality").is_dir():
repo_dirs.append(str(entry))
elif len(repo_dirs) == 1 and repo_dirs[0] == ".":
repo_dirs = [str(Path.cwd())]
else:
resolved = []
for name in repo_dirs:
p = Path(name)
if (p / "quality").is_dir():
resolved.append(name)
elif (SCRIPT_DIR / f"{name}-{version}").is_dir():
resolved.append(str(SCRIPT_DIR / f"{name}-{version}"))
elif (SCRIPT_DIR / name).is_dir():
resolved.append(str(SCRIPT_DIR / name))
else:
print(f"WARNING: Cannot find repo '{name}'")
repo_dirs = resolved
if not repo_dirs:
print(f"Usage: {sys.argv[0]} [--version V] [--all | repo1 repo2 ... | .]")
return 1
print("=== Quality Gate — Post-Run Validation ===")
print(f"Version: {version or 'unknown'}")
print(f"Strictness: {strictness}")
print(f"Repos: {len(repo_dirs)}")
for rd in repo_dirs:
check_repo(rd, version, strictness)
print("")
print("===========================================")
print(f"Total: {FAIL} FAIL, {WARN} WARN")
if FAIL > 0:
print(f"RESULT: GATE FAILED — {FAIL} check(s) must be fixed")
return 1
else:
print("RESULT: GATE PASSED")
return 0
if __name__ == "__main__":
sys.exit(main())