mirror of
https://github.com/github/awesome-copilot.git
synced 2026-04-30 12:15:56 +00:00
117 lines
4.1 KiB
Markdown
117 lines
4.1 KiB
Markdown
# Error Recovery Hooks
|
|
|
|
Keep the LLM investigating when tools fail instead of giving up with a partial result.
|
|
|
|
## Problem
|
|
|
|
When a shell command returns an error or a file operation hits a permission denial, the LLM tends to stop and apologize rather than trying a different approach. This produces incomplete results in agentic workflows where resilience matters.
|
|
|
|
## Solution
|
|
|
|
Use the SDK's hooks system (`on_post_tool_use`, `on_error_occurred`) to classify tool results by category and append continuation instructions that nudge the LLM to keep going.
|
|
|
|
```python
|
|
from enum import Enum
|
|
|
|
|
|
class ToolResultCategory(str, Enum):
    """Bucket assigned to the text a tool returned.

    The ``str`` mix-in makes every member compare equal to its plain
    string value, so a member can stand in wherever the raw label is used.
    """

    # Output matched one of the shell-failure phrases.
    SHELL_ERROR = "shell_error"
    # Output matched one of the permission-denial phrases.
    PERMISSION_DENIED = "permission_denied"
    # No failure phrase was found in the output.
    NORMAL = "normal"
|
|
|
|
|
|
class SDKErrorCategory(str, Enum):
    """Bucket assigned to an error reported by the SDK itself.

    The ``str`` mix-in makes every member compare equal to its plain
    string value.
    """

    # 4xx — not retryable
    CLIENT_ERROR = "client_error"
    # 5xx / timeout
    TRANSIENT = "transient"
    # Anything else that was not flagged as recoverable.
    NON_RECOVERABLE = "non_recoverable"
|
|
|
|
|
|
# Substrings whose presence marks tool output as a permission problem.
# They are compared against lower-cased output, so keep entries lower-case.
PERMISSION_DENIAL_PHRASES = [
    "permission denied",
    "access denied",
    "not permitted",
    "operation not allowed",
    "eacces",
    "eperm",
    "403 forbidden",
]

# Substrings whose presence marks tool output as a failed shell command.
# Same lower-case convention as the permission list.
SHELL_ERROR_PHRASES = [
    "command not found",
    "no such file or directory",
    "exit code",
    "errno",
    "traceback",
]
|
|
|
|
# Text appended to a failed tool result, keyed by failure category.
# ToolResultCategory.NORMAL has no entry on purpose: normal results are
# passed through without a note.
CONTINUATION_MESSAGES = {
    ToolResultCategory.SHELL_ERROR: (
        "\n\n[SYSTEM NOTE: This command encountered an error. This does NOT "
        "mean you should stop. Retry with different arguments, try a "
        "different tool, or move on.]"
    ),
    ToolResultCategory.PERMISSION_DENIED: (
        "\n\n[SYSTEM NOTE: Permission was denied for this specific action. "
        "Continue using alternative approaches.]"
    ),
}
|
|
|
|
|
|
def classify_tool_result(tool_name: str, result_text: str) -> ToolResultCategory:
    """Bucket a tool's text output by scanning it for known failure phrases.

    Matching is a case-insensitive substring search. Permission phrases are
    tried before shell-error phrases, so output containing both kinds is
    reported as a permission denial.

    Args:
        tool_name: Name of the tool that produced the output. It is accepted
            for callers' convenience but not consulted here.
        result_text: Raw text returned by the tool.

    Returns:
        The category of the first phrase table with a hit, or
        ``ToolResultCategory.NORMAL`` when nothing matches.
    """
    haystack = result_text.lower()

    # Order matters: the first table containing a matching phrase wins.
    lookup_order = (
        (PERMISSION_DENIAL_PHRASES, ToolResultCategory.PERMISSION_DENIED),
        (SHELL_ERROR_PHRASES, ToolResultCategory.SHELL_ERROR),
    )
    for phrases, verdict in lookup_order:
        for phrase in phrases:
            if phrase in haystack:
                return verdict

    return ToolResultCategory.NORMAL
|
|
|
|
|
|
def classify_sdk_error(error_msg: str, recoverable: bool) -> SDKErrorCategory:
    """Bucket an SDK error message as transient, client-side, or fatal.

    The message is searched case-insensitively. Transient markers are checked
    first, so a message mentioning both a transient and a client-error marker
    is treated as transient. When no marker is found the ``recoverable`` flag
    decides the outcome.

    Status codes only count when they are not embedded in a longer number;
    "port 4031" or "request 15030" no longer trip the 403 / 503 checks.

    Args:
        error_msg: Error text to inspect. ``None`` is treated as an empty
            message and any other non-string value is converted with
            ``str()`` instead of raising.
        recoverable: Fallback hint used when the text carries no marker.

    Returns:
        ``TRANSIENT`` for timeout / retry wording or a 502, 503 or 429 code;
        ``CLIENT_ERROR`` for a 400, 401, 403, 404 or 422 code; otherwise
        ``TRANSIENT`` if ``recoverable`` is truthy, else ``NON_RECOVERABLE``.
    """
    import re  # local import so the snippet stays copy-paste runnable

    text = "" if error_msg is None else str(error_msg).lower()

    def mentions_code(codes):
        # A code matches only when no digit touches it on either side.
        return any(
            re.search(rf"(?<!\d){code}(?!\d)", text) for code in codes
        )

    transient_words = ("timeout", "retry")
    if any(word in text for word in transient_words) or mentions_code(
        ("503", "502", "429")
    ):
        return SDKErrorCategory.TRANSIENT

    if mentions_code(("401", "403", "404", "400", "422")):
        return SDKErrorCategory.CLIENT_ERROR

    if recoverable:
        return SDKErrorCategory.TRANSIENT
    return SDKErrorCategory.NON_RECOVERABLE
|
|
```
|
|
|
|
## Hook Registration
|
|
|
|
Wire the classifiers into the SDK's hook system:
|
|
|
|
```python
|
|
def on_post_tool_use(input_data, env):
    """Append a continuation hint to a tool result that looks like a failure.

    Args:
        input_data: Mapping read via ``.get``; the ``"toolName"`` and
            ``"toolResult"`` keys are used, both defaulting to ``""``.
            The result is converted with ``str()`` before classification.
        env: Accepted as part of the hook signature; not used here.

    Returns:
        ``{"toolResult": <text + hint>}`` when the classified category has
        an entry in ``CONTINUATION_MESSAGES``; otherwise ``None``, leaving
        the result untouched.
    """
    name = input_data.get("toolName", "")
    output_text = str(input_data.get("toolResult", ""))

    verdict = classify_tool_result(name, output_text)
    if verdict not in CONTINUATION_MESSAGES:
        # Nothing to add for categories without a registered hint.
        return None

    hint = CONTINUATION_MESSAGES[verdict]
    return {"toolResult": output_text + hint}
|
|
|
|
|
|
def on_error_occurred(input_data, env):
    """Retry transient errors; skip every other error and keep going.

    Args:
        input_data: Mapping read via ``.get``; the ``"error"`` key holds the
            error to classify and ``"recoverable"`` (default ``False``) is
            the fallback hint handed to the classifier.
        env: Accepted as part of the hook signature; not used here.

    Returns:
        ``{"errorHandling": "retry", "retryCount": 2}`` for transient
        errors. Client errors and non-recoverable errors both yield a
        ``"skip"`` directive with a short user notification.
    """
    raw_error = input_data.get("error")
    # Coerce to text the same way on_post_tool_use does for tool results:
    # a missing/None error or an exception object must not crash the hook.
    error_msg = "" if raw_error is None else str(raw_error)
    recoverable = input_data.get("recoverable", False)

    category = classify_sdk_error(error_msg, recoverable)
    if category == SDKErrorCategory.TRANSIENT:
        return {"errorHandling": "retry", "retryCount": 2}

    return {
        "errorHandling": "skip",
        "userNotification": "Error occurred — continuing investigation.",
    }
|
|
```
|
|
|
|
## Tips
|
|
|
|
- **Tune the phrase lists** for your domain — add patterns from your actual tool output.
|
|
- **Log classified categories** so you can track how often each failure mode fires and whether the LLM actually recovers.
|
|
- **Cap continuation depth** — if the same tool fails 3+ times in a row, let the LLM give up rather than looping.
|
|
- The `SYSTEM NOTE` framing works well because the LLM treats it as authoritative instruction rather than user commentary.
|
|
|
|
## Runnable Example
|
|
|
|
See [`recipe/error_recovery_hooks.py`](recipe/error_recovery_hooks.py) for a complete working example.
|