update eval-driven-dev skill (#1434)

* update eval-driven-dev skill * fix: update skill update command to use correct repository path * address comments. * update eval driven dev
2026-04-30 12:15:56 +00:00 · 2026-04-27 18:27:48 -07:00
parent 9933f65e6b
commit 2860790bc9
23 changed files with 1881 additions and 700 deletions
--- a/skills/eval-driven-dev/resources/setup.sh
+++ b/skills/eval-driven-dev/resources/setup.sh
@@ -2,21 +2,74 @@
 # Setup script for eval-driven-dev skill.
 # Updates the skill, installs/upgrades pixie-qa[all], initializes the
 # pixie working directory, and starts the web UI server in the background.
-# Failures are non-fatal — the workflow continues even if a step here is
-# blocked by the environment.
+#
+# Error handling:
+#   - Skill update failure → non-fatal (continue with existing version)
+#   - pixie-qa upgrade failure when already installed → non-fatal
+#   - pixie-qa NOT installed and install fails → FATAL (exit 1)
+#   - pixie init failure → FATAL (exit 1)
+#   - pixie start failure → FATAL (exit 1)
 set -u

 echo "=== Updating skill ==="
-npx skills update || echo "(skill update skipped)"
+npx skills update github/awesome-copilot --skill eval-driven-dev -g -y && npx skills update github/awesome-copilot --skill eval-driven-dev -p -y || {
+  echo "(skill update failed — proceeding with existing version)"
+}

 echo ""
 echo "=== Installing / upgrading pixie-qa[all] ==="
+
+# Helper: check if pixie CLI is importable
+_pixie_available() {
+  if [ -f uv.lock ]; then
+    uv run python -c "import pixie" 2>/dev/null
+  elif [ -f poetry.lock ]; then
+    poetry run python -c "import pixie" 2>/dev/null
+  else
+    python -c "import pixie" 2>/dev/null
+  fi
+}
+
+# Check if pixie is already installed before attempting upgrade
+PIXIE_WAS_INSTALLED=false
+if _pixie_available; then
+  PIXIE_WAS_INSTALLED=true
+fi
+
+INSTALL_OK=false
 if [ -f uv.lock ]; then
-  uv add "pixie-qa[all]>=0.6.1,<0.7.0" --upgrade
+  # uv add does universal resolution across all Python versions in
+  # requires-python.  If the host project supports a Python version
+  # where pixie-qa is unavailable (e.g. <3.10), uv add fails.
+  # Fall back to uv pip install which only targets the active interpreter.
+  if uv add "pixie-qa[all]>=0.8.4,<0.9.0" --upgrade 2>&1; then
+    INSTALL_OK=true
+  else
+    echo "(uv add failed — falling back to uv pip install)"
+    if uv pip install "pixie-qa[all]>=0.8.4,<0.9.0" 2>&1; then
+      INSTALL_OK=true
+    fi
+  fi
 elif [ -f poetry.lock ]; then
-  poetry add "pixie-qa[all]>=0.6.1,<0.7.0"
+  if poetry add "pixie-qa[all]>=0.8.4,<0.9.0"; then
+    INSTALL_OK=true
+  fi
 else
-  pip install --upgrade "pixie-qa[all]>=0.6.1,<0.7.0"
+  if pip install --upgrade "pixie-qa[all]>=0.8.4,<0.9.0"; then
+    INSTALL_OK=true
+  fi
+fi
+
+if [ "$INSTALL_OK" = false ]; then
+  if [ "$PIXIE_WAS_INSTALLED" = true ]; then
+    echo "(pixie-qa upgrade failed — proceeding with existing version)"
+  else
+    echo ""
+    echo "ERROR: pixie-qa is not installed and installation failed."
+    echo "The eval-driven-dev workflow requires the pixie-qa package."
+    echo "Please install it manually and re-run this script."
+    exit 1
+  fi
 fi

 echo ""
@@ -29,6 +82,13 @@ else
  pixie init
 fi

+if [ $? -ne 0 ]; then
+  echo ""
+  echo "ERROR: Failed to initialize pixie working directory."
+  echo "Please check the error above and fix it before continuing."
+  exit 1
+fi
+
 echo ""
 echo "=== Starting web UI server (background) ==="
 if [ -f uv.lock ]; then
@@ -39,5 +99,12 @@ else
  pixie start
 fi

+if [ $? -ne 0 ]; then
+  echo ""
+  echo "ERROR: Failed to start the web UI server."
+  echo "Please check the error above and fix it before continuing."
+  exit 1
+fi
+
 echo ""
 echo "=== Setup complete ==="
--- a/skills/eval-driven-dev/resources/verify_step6_completion.py
+++ b/skills/eval-driven-dev/resources/verify_step6_completion.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+"""Validate that eval-driven-dev Step 6 artifacts are complete.
+
+Usage:
+    python verify_step6_completion.py /path/to/pixie_qa/results/<test_id>
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+ENTRY_REQUIRED_FILES = ("evaluations.jsonl",)
+DATASET_ANALYSIS_FILES = ("analysis.md", "analysis-summary.md")
+ROOT_ANALYSIS_FILES = ("action-plan.md", "action-plan-summary.md", "meta.json")
+
+
+def _dataset_dirs(results_dir: Path) -> list[Path]:
+    return sorted(
+        path
+        for path in results_dir.iterdir()
+        if path.is_dir() and path.name.startswith("dataset-")
+    )
+
+
+def _entry_dirs(dataset_dir: Path) -> list[Path]:
+    return sorted(
+        path
+        for path in dataset_dir.iterdir()
+        if path.is_dir() and path.name.startswith("entry-")
+    )
+
+
+def _read_jsonl(path: Path, errors: list[str]) -> list[dict[str, object]]:
+    rows: list[dict[str, object]] = []
+    try:
+        for index, line in enumerate(
+            path.read_text(encoding="utf-8").splitlines(), start=1
+        ):
+            if not line.strip():
+                continue
+            obj = json.loads(line)
+            if not isinstance(obj, dict):
+                errors.append(f"{path}: line {index} is not a JSON object")
+                continue
+            rows.append(obj)
+    except OSError as exc:
+        errors.append(f"{path}: could not read file ({exc})")
+    except json.JSONDecodeError as exc:
+        errors.append(f"{path}: invalid JSONL ({exc})")
+    return rows
+
+
+def validate_results_dir(results_dir: Path) -> list[str]:
+    """Return a list of validation errors for a pixie results directory."""
+    errors: list[str] = []
+
+    if not results_dir.is_dir():
+        return [f"{results_dir}: results directory not found"]
+
+    for file_name in ROOT_ANALYSIS_FILES:
+        if not (results_dir / file_name).is_file():
+            errors.append(f"Missing root artifact: {results_dir / file_name}")
+
+    datasets = _dataset_dirs(results_dir)
+    if not datasets:
+        errors.append(f"{results_dir}: no dataset-* directories found")
+        return errors
+
+    for dataset_dir in datasets:
+        for file_name in DATASET_ANALYSIS_FILES:
+            if not (dataset_dir / file_name).is_file():
+                errors.append(f"Missing dataset artifact: {dataset_dir / file_name}")
+
+        entry_dirs = _entry_dirs(dataset_dir)
+        if not entry_dirs:
+            errors.append(f"{dataset_dir}: no entry-* directories found")
+            continue
+
+        for entry_dir in entry_dirs:
+            for file_name in ENTRY_REQUIRED_FILES:
+                if not (entry_dir / file_name).is_file():
+                    errors.append(f"Missing entry artifact: {entry_dir / file_name}")
+
+            evaluations_path = entry_dir / "evaluations.jsonl"
+            if not evaluations_path.is_file():
+                continue
+
+            evaluations = _read_jsonl(evaluations_path, errors)
+            for row in evaluations:
+                status = row.get("status")
+                if status == "pending":
+                    errors.append(
+                        "Pending evaluation remains: "
+                        f"{evaluations_path} ({row.get('evaluator', 'unknown evaluator')})"
+                    )
+                    continue
+
+                if "score" not in row:
+                    errors.append(
+                        "Missing score in scored evaluation: "
+                        f"{evaluations_path} ({row.get('evaluator', 'unknown evaluator')})"
+                    )
+                if "reasoning" not in row:
+                    errors.append(
+                        "Missing reasoning in scored evaluation: "
+                        f"{evaluations_path} ({row.get('evaluator', 'unknown evaluator')})"
+                    )
+
+    return errors
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(
+        description="Validate Step 6 completion for a pixie results directory"
+    )
+    parser.add_argument(
+        "results_dir",
+        type=Path,
+        help="Path to pixie_qa/results/<test_id>",
+    )
+    args = parser.parse_args(argv)
+
+    errors = validate_results_dir(args.results_dir)
+    if errors:
+        print("Step 6 completion check failed:")
+        for error in errors:
+            print(f"- {error}")
+        return 1
+
+    print("Step 6 completion check passed.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())