update eval-driven-dev skill (#1434)

* update eval-driven-dev skill

* fix: update skill update command to use correct repository path

* address comments.

* update eval driven dev
This commit is contained in:
Yiou Li
2026-04-27 18:27:48 -07:00
committed by GitHub
parent 9933f65e6b
commit 2860790bc9
23 changed files with 1881 additions and 700 deletions

View File

@@ -2,21 +2,74 @@
# Setup script for eval-driven-dev skill.
# Updates the skill, installs/upgrades pixie-qa[all], initializes the
# pixie working directory, and starts the web UI server in the background.
# Failures are non-fatal — the workflow continues even if a step here is
# blocked by the environment.
#
# Error handling:
# - Skill update failure → non-fatal (continue with existing version)
# - pixie-qa upgrade failure when already installed → non-fatal
# - pixie-qa NOT installed and install fails → FATAL (exit 1)
# - pixie init failure → FATAL (exit 1)
# - pixie start failure → FATAL (exit 1)
set -u
echo "=== Updating skill ==="
npx skills update || echo "(skill update skipped)"
npx skills update github/awesome-copilot --skill eval-driven-dev -g -y && npx skills update github/awesome-copilot --skill eval-driven-dev -p -y || {
echo "(skill update failed — proceeding with existing version)"
}
echo ""
echo "=== Installing / upgrading pixie-qa[all] ==="
# Helper: check whether the pixie Python package is importable
_pixie_available() {
  # Succeeds (status 0) when `import pixie` works in the Python interpreter
  # chosen by the lock file found in the current directory: uv.lock first,
  # then poetry.lock, otherwise the plain `python` on PATH.
  # Only stderr is discarded; the function's status is the interpreter's status.
  if [ -f uv.lock ]; then
    uv run python -c "import pixie" 2>/dev/null
    return $?
  fi
  if [ -f poetry.lock ]; then
    poetry run python -c "import pixie" 2>/dev/null
    return $?
  fi
  python -c "import pixie" 2>/dev/null
}
# Check if pixie is already installed before attempting upgrade
PIXIE_WAS_INSTALLED=false
if _pixie_available; then
PIXIE_WAS_INSTALLED=true
fi
INSTALL_OK=false
if [ -f uv.lock ]; then
uv add "pixie-qa[all]>=0.6.1,<0.7.0" --upgrade
# uv add does universal resolution across all Python versions in
# requires-python. If the host project supports a Python version
# where pixie-qa is unavailable (e.g. <3.10), uv add fails.
# Fall back to uv pip install which only targets the active interpreter.
if uv add "pixie-qa[all]>=0.8.4,<0.9.0" --upgrade 2>&1; then
INSTALL_OK=true
else
echo "(uv add failed — falling back to uv pip install)"
if uv pip install "pixie-qa[all]>=0.8.4,<0.9.0" 2>&1; then
INSTALL_OK=true
fi
fi
elif [ -f poetry.lock ]; then
poetry add "pixie-qa[all]>=0.6.1,<0.7.0"
if poetry add "pixie-qa[all]>=0.8.4,<0.9.0"; then
INSTALL_OK=true
fi
else
pip install --upgrade "pixie-qa[all]>=0.6.1,<0.7.0"
if pip install --upgrade "pixie-qa[all]>=0.8.4,<0.9.0"; then
INSTALL_OK=true
fi
fi
if [ "$INSTALL_OK" = false ]; then
if [ "$PIXIE_WAS_INSTALLED" = true ]; then
echo "(pixie-qa upgrade failed — proceeding with existing version)"
else
echo ""
echo "ERROR: pixie-qa is not installed and installation failed."
echo "The eval-driven-dev workflow requires the pixie-qa package."
echo "Please install it manually and re-run this script."
exit 1
fi
fi
echo ""
@@ -29,6 +82,13 @@ else
pixie init
fi
if [ $? -ne 0 ]; then
echo ""
echo "ERROR: Failed to initialize pixie working directory."
echo "Please check the error above and fix it before continuing."
exit 1
fi
echo ""
echo "=== Starting web UI server (background) ==="
if [ -f uv.lock ]; then
@@ -39,5 +99,12 @@ else
pixie start
fi
if [ $? -ne 0 ]; then
echo ""
echo "ERROR: Failed to start the web UI server."
echo "Please check the error above and fix it before continuing."
exit 1
fi
echo ""
echo "=== Setup complete ==="

View File

@@ -0,0 +1,139 @@
#!/usr/bin/env python3
"""Validate that eval-driven-dev Step 6 artifacts are complete.
Usage:
python verify_step6_completion.py /path/to/pixie_qa/results/<test_id>
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
# File names that must exist inside every entry-* directory.
ENTRY_REQUIRED_FILES = ("evaluations.jsonl",)
# File names that must exist inside every dataset-* directory.
DATASET_ANALYSIS_FILES = ("analysis.md", "analysis-summary.md")
# File names that must exist directly under the results directory.
ROOT_ANALYSIS_FILES = ("action-plan.md", "action-plan-summary.md", "meta.json")
def _dataset_dirs(results_dir: Path) -> list[Path]:
return sorted(
path
for path in results_dir.iterdir()
if path.is_dir() and path.name.startswith("dataset-")
)
def _entry_dirs(dataset_dir: Path) -> list[Path]:
return sorted(
path
for path in dataset_dir.iterdir()
if path.is_dir() and path.name.startswith("entry-")
)
def _read_jsonl(path: Path, errors: list[str]) -> list[dict[str, object]]:
rows: list[dict[str, object]] = []
try:
for index, line in enumerate(
path.read_text(encoding="utf-8").splitlines(), start=1
):
if not line.strip():
continue
obj = json.loads(line)
if not isinstance(obj, dict):
errors.append(f"{path}: line {index} is not a JSON object")
continue
rows.append(obj)
except OSError as exc:
errors.append(f"{path}: could not read file ({exc})")
except json.JSONDecodeError as exc:
errors.append(f"{path}: invalid JSONL ({exc})")
return rows
def validate_results_dir(results_dir: Path) -> list[str]:
    """Collect every Step 6 completeness problem found under *results_dir*.

    Checks, in order:

    * *results_dir* is a directory (otherwise that is the only error returned);
    * each name in ``ROOT_ANALYSIS_FILES`` exists as a file at the root;
    * at least one ``dataset-*`` directory exists (otherwise stop there);
    * each dataset directory holds every ``DATASET_ANALYSIS_FILES`` file and
      at least one ``entry-*`` directory;
    * each entry directory holds every ``ENTRY_REQUIRED_FILES`` file;
    * no row of an entry's ``evaluations.jsonl`` has status ``"pending"``, and
      every non-pending row carries both ``score`` and ``reasoning`` keys.

    Returns:
        Human-readable error messages; an empty list means nothing was found.
    """
    if not results_dir.is_dir():
        return [f"{results_dir}: results directory not found"]

    problems: list[str] = [
        f"Missing root artifact: {results_dir / name}"
        for name in ROOT_ANALYSIS_FILES
        if not (results_dir / name).is_file()
    ]

    dataset_dirs = _dataset_dirs(results_dir)
    if not dataset_dirs:
        problems.append(f"{results_dir}: no dataset-* directories found")
        return problems

    # Keys a non-pending row must contain, paired with the message prefix
    # reported when the key is absent.
    scored_requirements = (
        ("score", "Missing score in scored evaluation: "),
        ("reasoning", "Missing reasoning in scored evaluation: "),
    )

    for dataset_dir in dataset_dirs:
        problems.extend(
            f"Missing dataset artifact: {dataset_dir / name}"
            for name in DATASET_ANALYSIS_FILES
            if not (dataset_dir / name).is_file()
        )

        entries = _entry_dirs(dataset_dir)
        if not entries:
            problems.append(f"{dataset_dir}: no entry-* directories found")
            continue

        for entry_dir in entries:
            problems.extend(
                f"Missing entry artifact: {entry_dir / name}"
                for name in ENTRY_REQUIRED_FILES
                if not (entry_dir / name).is_file()
            )

            evaluations_path = entry_dir / "evaluations.jsonl"
            if not evaluations_path.is_file():
                # Nothing to inspect row by row for this entry.
                continue

            # _read_jsonl appends its own read/parse errors to `problems`.
            for row in _read_jsonl(evaluations_path, problems):
                location = (
                    f"{evaluations_path} ({row.get('evaluator', 'unknown evaluator')})"
                )
                if row.get("status") == "pending":
                    problems.append("Pending evaluation remains: " + location)
                    continue
                for key, prefix in scored_requirements:
                    if key not in row:
                        problems.append(prefix + location)

    return problems
def main(argv: list[str] | None = None) -> int:
    """Run the Step 6 completion check from the command line.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` when no validation errors were found, ``1`` otherwise. Every
        error is printed on its own line, prefixed with ``- ``.
    """
    cli = argparse.ArgumentParser(
        description="Validate Step 6 completion for a pixie results directory"
    )
    cli.add_argument(
        "results_dir",
        type=Path,
        help="Path to pixie_qa/results/<test_id>",
    )
    problems = validate_results_dir(cli.parse_args(argv).results_dir)

    if not problems:
        print("Step 6 completion check passed.")
        return 0

    print("Step 6 completion check failed:")
    for problem in problems:
        print(f"- {problem}")
    return 1


# When run as a script, main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())