mirror of
https://github.com/github/awesome-copilot.git
synced 2026-05-15 11:11:48 +00:00
chore: sync Arize skills from arize-skills@6a622b6c962907f54ca3578cb2cabff161d8aae6 and phoenix@30ccbe6b38cc83719038bf30041335f29bae45e9 (#1690)
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
This commit is contained in:
@@ -4,6 +4,8 @@ Creating and managing evaluation datasets.
|
||||
|
||||
## Creating Datasets
|
||||
|
||||
`create_dataset()` upserts: if a dataset with the same name already exists, it is updated in place; re-running with identical inputs is a no-op.
|
||||
|
||||
```python
|
||||
from phoenix.client import Client
|
||||
|
||||
@@ -21,6 +23,19 @@ dataset = client.datasets.create_dataset(
|
||||
],
|
||||
)
|
||||
|
||||
# With stable example IDs for targeted updates across uploads
|
||||
dataset = client.datasets.create_dataset(
|
||||
name="qa-test-v1",
|
||||
examples=[
|
||||
{
|
||||
"id": "q-001", # stable ID — server updates this row, not inserts
|
||||
"input": {"question": "What is 2+2?"},
|
||||
"output": {"answer": "4"},
|
||||
"metadata": {"category": "math"},
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
# From DataFrame
|
||||
dataset = client.datasets.create_dataset(
|
||||
dataframe=df,
|
||||
@@ -28,6 +43,8 @@ dataset = client.datasets.create_dataset(
|
||||
input_keys=["question"],
|
||||
output_keys=["answer"],
|
||||
metadata_keys=["category"],
|
||||
split_key="split", # single split column (use this instead of deprecated split_keys)
|
||||
example_id_key="id", # column containing stable example IDs
|
||||
)
|
||||
```
|
||||
|
||||
@@ -58,6 +75,9 @@ df = dataset.to_dataframe()
|
||||
| `input_keys` | Columns for task input |
|
||||
| `output_keys` | Columns for expected output |
|
||||
| `metadata_keys` | Additional context |
|
||||
| `example_id_key` | Column with stable example IDs; server updates the matching row instead of inserting |
|
||||
| `split_key` | Single column for split assignment (replaces deprecated `split_keys`) |
|
||||
| `split_keys` | **Deprecated** — use `split_key` (singular) instead |
|
||||
|
||||
## Using Evaluators in Experiments
|
||||
|
||||
@@ -128,6 +148,8 @@ See `tutorials/evals/evals-2/evals_2.0_rag_demo.ipynb` for a full worked example
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Versioning**: Create new datasets (e.g., `qa-test-v2`), don't modify
|
||||
- **Upsert by default**: Re-upload to the same name to update in-place; use `example_id_key` so the server targets specific rows instead of treating every upload as new data
|
||||
- **Versioning**: Version with tags or new names (e.g., `qa-test-v2`) when you want a clean snapshot, not just incremental edits
|
||||
- **Metadata**: Track source, category, difficulty
|
||||
- **Balance**: Ensure diverse coverage across categories
|
||||
- **Avoid `split_keys`**: Pass `split_key` (singular) — `split_keys` is deprecated and emits a `DeprecationWarning`
|
||||
|
||||
@@ -4,6 +4,8 @@ Creating and managing evaluation datasets.
|
||||
|
||||
## Creating Datasets
|
||||
|
||||
`createDataset()` upserts: if a dataset with the same name already exists, it is updated to match the provided examples. Re-running with identical inputs is a no-op.
|
||||
|
||||
```typescript
|
||||
import { createClient } from "@arizeai/phoenix-client";
|
||||
import { createDataset } from "@arizeai/phoenix-client/datasets";
|
||||
@@ -21,15 +23,32 @@ const { datasetId } = await createDataset({
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
// With stable example IDs for targeted updates across uploads
|
||||
const { datasetId } = await createDataset({
|
||||
client,
|
||||
name: "qa-test-v1",
|
||||
examples: [
|
||||
{
|
||||
id: "q-001", // stable ID — server updates this row instead of inserting a new one
|
||||
input: { question: "What is 2+2?" },
|
||||
output: { answer: "4" },
|
||||
metadata: { category: "math" },
|
||||
},
|
||||
],
|
||||
});
|
||||
```
|
||||
|
||||
## Example Structure
|
||||
|
||||
```typescript
|
||||
interface DatasetExample {
|
||||
interface Example {
|
||||
input: Record<string, unknown>; // Task input
|
||||
output?: Record<string, unknown>; // Expected output
|
||||
metadata?: Record<string, unknown>; // Additional context
|
||||
output?: Record<string, unknown> | null; // Expected output
|
||||
metadata?: Record<string, unknown> | null; // Additional context
|
||||
splits?: string | string[] | null; // Split assignment ("train", ["train", "easy"], etc.)
|
||||
spanId?: string | null; // OTEL span ID to link back to source trace
|
||||
id?: string | null; // Stable user-provided ID; server updates the matching row instead of inserting
|
||||
}
|
||||
```
|
||||
|
||||
@@ -64,6 +83,7 @@ const all = await listDatasets({ client });
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Versioning**: Create new datasets, don't modify existing
|
||||
- **Upsert by default**: Re-upload to the same name to update in-place; use `id` on examples so the server targets specific rows instead of treating every upload as new data
|
||||
- **Versioning**: Version with new names (e.g., `qa-test-v2`) when you want a clean snapshot, not just incremental edits
|
||||
- **Metadata**: Track source, category, provenance
|
||||
- **Type safety**: Use TypeScript interfaces for structure
|
||||
- **Type safety**: Use the `Example` type from `@arizeai/phoenix-client/datasets`
|
||||
|
||||
Reference in New Issue
Block a user