mirror of
https://github.com/github/awesome-copilot.git
synced 2026-04-30 20:25:55 +00:00
update eval-driven-dev skill (#1434)
* update eval-driven-dev skill * fix: update skill update command to use correct repository path * address comments. * update eval driven dev
This commit is contained in:
139
skills/eval-driven-dev/resources/verify_step6_completion.py
Normal file
139
skills/eval-driven-dev/resources/verify_step6_completion.py
Normal file
@@ -0,0 +1,139 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Validate that eval-driven-dev Step 6 artifacts are complete.
|
||||
|
||||
Usage:
|
||||
python verify_step6_completion.py /path/to/pixie_qa/results/<test_id>
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
ENTRY_REQUIRED_FILES = ("evaluations.jsonl",)
|
||||
DATASET_ANALYSIS_FILES = ("analysis.md", "analysis-summary.md")
|
||||
ROOT_ANALYSIS_FILES = ("action-plan.md", "action-plan-summary.md", "meta.json")
|
||||
|
||||
|
||||
def _dataset_dirs(results_dir: Path) -> list[Path]:
|
||||
return sorted(
|
||||
path
|
||||
for path in results_dir.iterdir()
|
||||
if path.is_dir() and path.name.startswith("dataset-")
|
||||
)
|
||||
|
||||
|
||||
def _entry_dirs(dataset_dir: Path) -> list[Path]:
|
||||
return sorted(
|
||||
path
|
||||
for path in dataset_dir.iterdir()
|
||||
if path.is_dir() and path.name.startswith("entry-")
|
||||
)
|
||||
|
||||
|
||||
def _read_jsonl(path: Path, errors: list[str]) -> list[dict[str, object]]:
|
||||
rows: list[dict[str, object]] = []
|
||||
try:
|
||||
for index, line in enumerate(
|
||||
path.read_text(encoding="utf-8").splitlines(), start=1
|
||||
):
|
||||
if not line.strip():
|
||||
continue
|
||||
obj = json.loads(line)
|
||||
if not isinstance(obj, dict):
|
||||
errors.append(f"{path}: line {index} is not a JSON object")
|
||||
continue
|
||||
rows.append(obj)
|
||||
except OSError as exc:
|
||||
errors.append(f"{path}: could not read file ({exc})")
|
||||
except json.JSONDecodeError as exc:
|
||||
errors.append(f"{path}: invalid JSONL ({exc})")
|
||||
return rows
|
||||
|
||||
|
||||
def validate_results_dir(results_dir: Path) -> list[str]:
|
||||
"""Return a list of validation errors for a pixie results directory."""
|
||||
errors: list[str] = []
|
||||
|
||||
if not results_dir.is_dir():
|
||||
return [f"{results_dir}: results directory not found"]
|
||||
|
||||
for file_name in ROOT_ANALYSIS_FILES:
|
||||
if not (results_dir / file_name).is_file():
|
||||
errors.append(f"Missing root artifact: {results_dir / file_name}")
|
||||
|
||||
datasets = _dataset_dirs(results_dir)
|
||||
if not datasets:
|
||||
errors.append(f"{results_dir}: no dataset-* directories found")
|
||||
return errors
|
||||
|
||||
for dataset_dir in datasets:
|
||||
for file_name in DATASET_ANALYSIS_FILES:
|
||||
if not (dataset_dir / file_name).is_file():
|
||||
errors.append(f"Missing dataset artifact: {dataset_dir / file_name}")
|
||||
|
||||
entry_dirs = _entry_dirs(dataset_dir)
|
||||
if not entry_dirs:
|
||||
errors.append(f"{dataset_dir}: no entry-* directories found")
|
||||
continue
|
||||
|
||||
for entry_dir in entry_dirs:
|
||||
for file_name in ENTRY_REQUIRED_FILES:
|
||||
if not (entry_dir / file_name).is_file():
|
||||
errors.append(f"Missing entry artifact: {entry_dir / file_name}")
|
||||
|
||||
evaluations_path = entry_dir / "evaluations.jsonl"
|
||||
if not evaluations_path.is_file():
|
||||
continue
|
||||
|
||||
evaluations = _read_jsonl(evaluations_path, errors)
|
||||
for row in evaluations:
|
||||
status = row.get("status")
|
||||
if status == "pending":
|
||||
errors.append(
|
||||
"Pending evaluation remains: "
|
||||
f"{evaluations_path} ({row.get('evaluator', 'unknown evaluator')})"
|
||||
)
|
||||
continue
|
||||
|
||||
if "score" not in row:
|
||||
errors.append(
|
||||
"Missing score in scored evaluation: "
|
||||
f"{evaluations_path} ({row.get('evaluator', 'unknown evaluator')})"
|
||||
)
|
||||
if "reasoning" not in row:
|
||||
errors.append(
|
||||
"Missing reasoning in scored evaluation: "
|
||||
f"{evaluations_path} ({row.get('evaluator', 'unknown evaluator')})"
|
||||
)
|
||||
|
||||
return errors
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
|
||||
"""CLI entry point."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Validate Step 6 completion for a pixie results directory"
|
||||
)
|
||||
parser.add_argument(
|
||||
"results_dir",
|
||||
type=Path,
|
||||
help="Path to pixie_qa/results/<test_id>",
|
||||
)
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
errors = validate_results_dir(args.results_dir)
|
||||
if errors:
|
||||
print("Step 6 completion check failed:")
|
||||
for error in errors:
|
||||
print(f"- {error}")
|
||||
return 1
|
||||
|
||||
print("Step 6 completion check passed.")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user