mirror of
https://github.com/github/awesome-copilot.git
synced 2026-05-04 22:25:57 +00:00
chore: sync Arize skills from arize-skills@597d609bfe5f07fd7d24acfdb408a082911b18fc and phoenix@746247cbb07b0dc7803b87c69dd8c77811c33f59 (#1583)
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
This commit is contained in:
@@ -81,11 +81,25 @@ relevance = ClassificationEvaluator(
|
||||
## Pre-Built
|
||||
|
||||
```python
|
||||
from phoenix.experiments.evaluators import ContainsAnyKeyword, JSONParseable, MatchesRegex
|
||||
from phoenix.client.experiments import create_evaluator
|
||||
from phoenix.evals.metrics import MatchesRegex
|
||||
|
||||
evaluators = [
|
||||
ContainsAnyKeyword(keywords=["disclaimer"]),
|
||||
JSONParseable(),
|
||||
MatchesRegex(pattern=r"\d{4}-\d{2}-\d{2}"),
|
||||
]
|
||||
date_format = MatchesRegex(pattern=r"\d{4}-\d{2}-\d{2}")
|
||||
|
||||
|
||||
@create_evaluator(name="contains_any_keyword", kind="code")
|
||||
def contains_any_keyword(output, expected):
|
||||
keywords = expected.get("keywords", [])
|
||||
return any(kw.lower() in str(output).lower() for kw in keywords)
|
||||
|
||||
|
||||
@create_evaluator(name="json_parseable", kind="code")
|
||||
def json_parseable(output):
|
||||
import json
|
||||
|
||||
try:
|
||||
json.loads(output)
|
||||
return True
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
return False
|
||||
```
|
||||
|
||||
@@ -14,9 +14,10 @@ EXPERIMENT → Run task on all examples, score results
|
||||
## Basic Usage
|
||||
|
||||
```python
|
||||
from phoenix.client.experiments import run_experiment
|
||||
from phoenix.client import Client
|
||||
|
||||
experiment = run_experiment(
|
||||
client = Client()
|
||||
experiment = client.experiments.run_experiment(
|
||||
dataset=my_dataset,
|
||||
task=my_task,
|
||||
evaluators=[accuracy, faithfulness],
|
||||
@@ -40,7 +41,28 @@ print(experiment.aggregate_scores)
|
||||
Test setup before full execution:
|
||||
|
||||
```python
|
||||
experiment = run_experiment(dataset, task, evaluators, dry_run=3) # Just 3 examples
|
||||
experiment = client.experiments.run_experiment(
|
||||
dataset=dataset,
|
||||
task=task,
|
||||
evaluators=evaluators,
|
||||
dry_run=3,
|
||||
) # Just 3 examples
|
||||
```
|
||||
|
||||
## Async Usage
|
||||
|
||||
Use `AsyncClient` when your task or evaluators make network calls and you want higher throughput:
|
||||
|
||||
```python
|
||||
from phoenix.client import AsyncClient
|
||||
|
||||
client = AsyncClient()
|
||||
experiment = await client.experiments.run_experiment(
|
||||
dataset=my_dataset,
|
||||
task=my_async_task,
|
||||
evaluators=[accuracy, faithfulness],
|
||||
experiment_name="improved-retrieval-v2",
|
||||
)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
@@ -69,6 +69,33 @@ for run in experiment.runs:
|
||||
print(run.output, run.scores)
|
||||
```
|
||||
|
||||
## Stability
|
||||
|
||||
Single-run scores are noisy when either the task or the evaluator is non-deterministic — an LLM call, tool use, streaming output, an LLM-as-judge. On a small dataset, that per-run noise can swamp the signal from a prompt change.
|
||||
|
||||
Averaging over repetitions lets the score you report reflect the prompt rather than the sampling noise:
|
||||
|
||||
```python
|
||||
run_experiment(
|
||||
# ...
|
||||
repetitions=3,
|
||||
)
|
||||
```
|
||||
|
||||
Things to consider:
|
||||
|
||||
- Reach for repetitions when the task or the evaluator is an LLM call and the dataset is small.
|
||||
- Prefer repetitions when per-example cost is low and you mostly want to settle the score; prefer growing the dataset when you also need to cover more behaviors.
|
||||
- Skip repetitions when both the task and the evaluator are deterministic (e.g. string comparison against a ground truth) — a single run is the answer.
|
||||
|
||||
Consider adding repetitions to stabilize scores when:
|
||||
|
||||
- Repeat runs of the same experiment drift in ways that feel larger than the differences you're trying to measure.
|
||||
- A prompt change flips example labels in ways that don't track with how the outputs actually changed.
|
||||
- The judge's reasoning on the same output reads differently from one run to the next.
|
||||
|
||||
A single run is what `repetitions=1` (default) silently relies on — don't trust a tuning decision based on a single 10-example run.
|
||||
|
||||
## Add Evaluations Later
|
||||
|
||||
```python
|
||||
|
||||
@@ -73,6 +73,33 @@ const experiment = await runExperiment({
|
||||
});
|
||||
```
|
||||
|
||||
## Stability
|
||||
|
||||
Single-run scores are noisy when either the task or the evaluator is non-deterministic — an LLM call, tool use, streaming output, an LLM-as-judge. On a small dataset, that per-run noise can swamp the signal from a prompt change.
|
||||
|
||||
Averaging over repetitions lets the score you report reflect the prompt rather than the sampling noise:
|
||||
|
||||
```typescript
|
||||
await runExperiment({
|
||||
// ...
|
||||
repetitions: 3,
|
||||
});
|
||||
```
|
||||
|
||||
Things to consider:
|
||||
|
||||
- Reach for repetitions when the task or the evaluator is an LLM call and the dataset is small.
|
||||
- Prefer repetitions when per-example cost is low and you mostly want to settle the score; prefer growing the dataset when you also need to cover more behaviors.
|
||||
- Skip repetitions when both the task and the evaluator are deterministic (e.g. string comparison against a ground truth) — a single run is the answer.
|
||||
|
||||
Consider adding repetitions to stabilize scores when:
|
||||
|
||||
- Repeat runs of the same experiment drift in ways that feel larger than the differences you're trying to measure.
|
||||
- A prompt change flips example labels in ways that don't track with how the outputs actually changed.
|
||||
- The judge's reasoning on the same output reads differently from one run to the next.
|
||||
|
||||
A single run is what `repetitions: 1` (default) silently relies on — don't trust a tuning decision based on a single 10-example run.
|
||||
|
||||
## Add Evaluations Later
|
||||
|
||||
```typescript
|
||||
|
||||
@@ -11,12 +11,16 @@ Common mistakes and fixes.
|
||||
| Saturation blindness | 100% pass = no signal | Keep capability evals at 50-80% |
|
||||
| Similarity metrics | BERTScore/ROUGE for generation | Use for retrieval only |
|
||||
| Model switching | Hoping a model works better | Error analysis first |
|
||||
| Single-run scoring | LLM judges and non-deterministic tasks add per-run noise that can drown the signal from a prompt change on a small dataset | Set `repetitions` on `run_experiment` (Python) or `runExperiment` (TypeScript), or grow the dataset, when the task or judge is an LLM call |
|
||||
|
||||
## Quantify Changes
|
||||
|
||||
```python
|
||||
baseline = run_experiment(dataset, old_prompt, evaluators)
|
||||
improved = run_experiment(dataset, new_prompt, evaluators)
|
||||
from phoenix.client import Client
|
||||
|
||||
client = Client()
|
||||
baseline = client.experiments.run_experiment(dataset=dataset, task=old_prompt, evaluators=evaluators)
|
||||
improved = client.experiments.run_experiment(dataset=dataset, task=new_prompt, evaluators=evaluators)
|
||||
print(f"Improvement: {improved.pass_rate - baseline.pass_rate:+.1%}")
|
||||
```
|
||||
|
||||
|
||||
@@ -41,9 +41,17 @@ judge_cheap = ClassificationEvaluator(
|
||||
## Don't Model Shop
|
||||
|
||||
```python
|
||||
from phoenix.client import Client
|
||||
|
||||
client = Client()
|
||||
|
||||
# BAD
|
||||
for model in ["gpt-4o", "claude-3", "gemini-pro"]:
|
||||
results = run_experiment(dataset, task, model)
|
||||
results = client.experiments.run_experiment(
|
||||
dataset=dataset,
|
||||
task=lambda input, _model=model: task(input, model=_model),
|
||||
evaluators=evaluators,
|
||||
)
|
||||
|
||||
# GOOD
|
||||
failures = analyze_errors(results)
|
||||
|
||||
@@ -14,6 +14,10 @@ CI/CD evals vs production monitoring - complementary approaches.
|
||||
## CI/CD Evaluations
|
||||
|
||||
```python
|
||||
from phoenix.client import Client
|
||||
|
||||
client = Client()
|
||||
|
||||
# Fast, deterministic checks
|
||||
ci_evaluators = [
|
||||
has_required_format,
|
||||
@@ -23,7 +27,7 @@ ci_evaluators = [
|
||||
]
|
||||
|
||||
# Small but representative dataset (~100 examples)
|
||||
run_experiment(ci_dataset, task, ci_evaluators)
|
||||
client.experiments.run_experiment(dataset=ci_dataset, task=task, evaluators=ci_evaluators)
|
||||
```
|
||||
|
||||
Set thresholds: regression=0.95, safety=1.0, format=0.98.
|
||||
|
||||
Reference in New Issue
Block a user