Files
awesome-copilot/hooks/secrets-scanner/scan-secrets.sh
Shehab Sherif 446f0d767c New hook: secrets-scanner (#1002)
* New hook: secrets-scanner

Add a secrets-scanner hook that scans files modified during a Copilot
coding agent session for leaked secrets, credentials, and sensitive data.

The hook runs on sessionEnd and inspects files in one of three scopes:
- diff: only files changed in the current session (default)
- staged: only files currently staged in the git index
- all: every tracked file in the repository

Detected pattern categories:
- AWS access keys and secret keys
- GCP service account credentials
- Azure client secrets and storage connection strings
- GitHub personal access tokens
- Slack tokens (bot, user, webhook)
- Private key headers (RSA, EC, DSA, OpenSSH, PEM)
- Generic high-entropy bearer tokens
- Internal IP:port strings

Configurable via environment variables (SCAN_MODE, SCAN_SCOPE,
SECRETS_ALLOWLIST) so teams can tune for their workflow without
editing the script. Patterns are POSIX ERE (grep -E) compatible,
with no PCRE metacharacters, for portability across macOS and Linux.

Files: hooks.json, scan-secrets.sh, README.md

* refactor: move PATTERNS array to top of scan-secrets.sh for discoverability

Move the PATTERNS declaration to the top of the file so it is clearly
visible and easy to customize, as suggested in code review. Added a
descriptive header comment. No functional changes.

---------

Co-authored-by: Shehab Sherif <shehabsherif0@users.noreply.github.com>
2026-03-16 11:05:35 +11:00

273 lines
9.8 KiB
Bash
Executable File

#!/bin/bash
# Secrets Scanner Hook
#
# Looks through files touched during a Copilot coding agent session and reports
# anything that resembles a leaked secret, credential, or other sensitive value
# before it gets committed.
#
# Environment variables:
#   SCAN_MODE         - "warn" only logs findings; "block" exits non-zero when
#                       findings exist (default: warn)
#   SCAN_SCOPE        - "staged" scans the staged (index) content of staged
#                       files; any other value scans files changed relative to
#                       HEAD plus untracked files (default: diff)
#   SKIP_SECRETS_SCAN - "true" disables scanning entirely (default: unset)
#   SECRETS_LOG_DIR   - directory for scan logs (default: logs/copilot/secrets)
#   SECRETS_ALLOWLIST - comma-separated substrings; a match containing any of
#                       them is ignored (default: unset)

# Strict mode: stop on errors, on unset variables, and on failed pipeline stages
set -o errexit -o nounset -o pipefail
# ---------------------------------------------------------------------------
# Secret detection patterns (edit this list to add or remove patterns)
#
# Each entry: "PATTERN_NAME|SEVERITY|REGEX"
# Severity levels: critical, high, medium
#
# Entries are split on the first two '|' characters, so PATTERN_NAME and
# SEVERITY must not contain '|'; REGEX (an extended regular expression that
# is handed to grep -E) may use '|' freely for alternation.
# Entries are double-quoted shell strings: write \" for a literal double
# quote and \\ for a literal backslash inside a regex.
# ---------------------------------------------------------------------------
PATTERNS=(
# Cloud provider credentials
"AWS_ACCESS_KEY|critical|AKIA[0-9A-Z]{16}"
"AWS_SECRET_KEY|critical|aws_secret_access_key[[:space:]]*[:=][[:space:]]*['\"]?[A-Za-z0-9/+=]{40}"
"GCP_SERVICE_ACCOUNT|critical|\"type\"[[:space:]]*:[[:space:]]*\"service_account\""
"GCP_API_KEY|high|AIza[0-9A-Za-z_-]{35}"
"AZURE_CLIENT_SECRET|critical|azure[_-]?client[_-]?secret[[:space:]]*[:=][[:space:]]*['\"]?[A-Za-z0-9_~.-]{34,}"
# GitHub tokens
"GITHUB_PAT|critical|ghp_[0-9A-Za-z]{36}"
"GITHUB_OAUTH|critical|gho_[0-9A-Za-z]{36}"
"GITHUB_APP_TOKEN|critical|ghs_[0-9A-Za-z]{36}"
"GITHUB_REFRESH_TOKEN|critical|ghr_[0-9A-Za-z]{36}"
"GITHUB_FINE_GRAINED_PAT|critical|github_pat_[0-9A-Za-z_]{82}"
# Private keys
"PRIVATE_KEY|critical|-----BEGIN (RSA |EC |OPENSSH |DSA |PGP )?PRIVATE KEY-----"
"PGP_PRIVATE_BLOCK|critical|-----BEGIN PGP PRIVATE KEY BLOCK-----"
# Generic secrets and tokens
# (keyword, then ':' or '=', then an optionally quoted value of 8+ characters)
"GENERIC_SECRET|high|(secret|token|password|passwd|pwd|api[_-]?key|apikey|access[_-]?key|auth[_-]?token|client[_-]?secret)[[:space:]]*[:=][[:space:]]*['\"]?[A-Za-z0-9_/+=~.-]{8,}"
"CONNECTION_STRING|high|(mongodb(\\+srv)?|postgres(ql)?|mysql|redis|amqp|mssql)://[^[:space:]'\"]{10,}"
"BEARER_TOKEN|medium|[Bb]earer[[:space:]]+[A-Za-z0-9_-]{20,}\.[A-Za-z0-9_-]{20,}"
# Messaging and SaaS tokens
"SLACK_TOKEN|high|xox[baprs]-[0-9]{10,}-[0-9A-Za-z-]+"
"SLACK_WEBHOOK|high|https://hooks\.slack\.com/services/T[0-9A-Z]{8,}/B[0-9A-Z]{8,}/[0-9A-Za-z]{24}"
"DISCORD_TOKEN|high|[MN][A-Za-z0-9]{23,}\.[A-Za-z0-9_-]{6}\.[A-Za-z0-9_-]{27,}"
"TWILIO_API_KEY|high|SK[0-9a-fA-F]{32}"
"SENDGRID_API_KEY|high|SG\.[0-9A-Za-z_-]{22}\.[0-9A-Za-z_-]{43}"
"STRIPE_SECRET_KEY|critical|sk_live_[0-9A-Za-z]{24,}"
"STRIPE_RESTRICTED_KEY|high|rk_live_[0-9A-Za-z]{24,}"
# npm tokens
"NPM_TOKEN|high|npm_[0-9A-Za-z]{36}"
# JWT (long, structured tokens)
"JWT_TOKEN|medium|eyJ[A-Za-z0-9_-]{10,}\.eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}"
# IP addresses with ports (possible internal services)
# Only the 10.x.x.x, 172.16-31.x.x and 192.168.x.x ranges are matched. The
# leading and trailing groups consume one boundary character (or line edge),
# so the raw match can carry an extra character on either side.
"INTERNAL_IP_PORT|medium|(^|[^.0-9])(10\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|172\.(1[6-9]|2[0-9]|3[01])\.[0-9]{1,3}\.[0-9]{1,3}|192\.168\.[0-9]{1,3}\.[0-9]{1,3}):[0-9]{2,5}([^0-9]|$)"
)
# Honor the opt-out switch before doing any work
case "${SKIP_SECRETS_SCAN:-}" in
  true)
    echo "⏭️ Secrets scan skipped (SKIP_SECRETS_SCAN=true)"
    exit 0
    ;;
esac

# File discovery relies on git, so outside a work tree there is nothing to do
git rev-parse --is-inside-work-tree &>/dev/null || {
  echo "⚠️ Not in a git repository, skipping secrets scan"
  exit 0
}

# Run-wide settings (environment overrides with defaults) and counters
FINDING_COUNT=0
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
MODE="${SCAN_MODE:-warn}"
SCOPE="${SCAN_SCOPE:-diff}"
LOG_DIR="${SECRETS_LOG_DIR:-logs/copilot/secrets}"
LOG_FILE="$LOG_DIR/scan.log"

# The log directory must exist before any result line is appended
mkdir -p "$LOG_DIR"
# Collect the files to scan for the selected scope.
#
# Paths are read NUL-delimited (git's -z output) so that names containing
# spaces, newlines, or non-ASCII characters arrive verbatim. Without -z git
# may C-quote such names, and the quoted form does not resolve to a file on
# disk, which would make the scanner skip those files silently.
#   SCOPE=staged : files added/copied/modified/renamed in the index
#   otherwise    : files changed relative to HEAD (or, if that command fails,
#                  relative to the index) plus untracked, non-ignored files
FILES=()
if [[ "$SCOPE" == "staged" ]]; then
  while IFS= read -r -d '' f; do
    if [[ -n "$f" ]]; then
      FILES+=("$f")
    fi
  done < <(git diff --cached --name-only -z --diff-filter=ACMR 2>/dev/null)
else
  while IFS= read -r -d '' f; do
    if [[ -n "$f" ]]; then
      FILES+=("$f")
    fi
  done < <(git diff --name-only -z --diff-filter=ACMR HEAD 2>/dev/null \
    || git diff --name-only -z --diff-filter=ACMR 2>/dev/null)
  # Also include untracked new files (created during the session, not yet in HEAD)
  while IFS= read -r -d '' f; do
    if [[ -n "$f" ]]; then
      FILES+=("$f")
    fi
  done < <(git ls-files -z --others --exclude-standard 2>/dev/null)
fi

# Nothing to scan: record a clean run and stop early
if [[ ${#FILES[@]} -eq 0 ]]; then
  echo "✨ No modified files to scan"
  printf '{"timestamp":"%s","event":"scan_complete","mode":"%s","scope":"%s","status":"clean","files_scanned":0}\n' \
    "$TIMESTAMP" "$MODE" "$SCOPE" >> "$LOG_FILE"
  exit 0
fi
# Split SECRETS_ALLOWLIST on commas into the ALLOWLIST array.
# The array stays empty when the variable is unset or empty.
ALLOWLIST=()
[[ -z "${SECRETS_ALLOWLIST:-}" ]] || IFS=',' read -r -a ALLOWLIST <<< "$SECRETS_ALLOWLIST"
#######################################
# Report whether a matched fragment is covered by the allowlist.
# Globals:   ALLOWLIST (read)
# Arguments: $1 - the matched text to check
# Returns:   0 if any allowlist entry (whitespace-trimmed, non-empty) occurs
#            in $1 as a literal substring; 1 otherwise
#######################################
is_allowlisted() {
  local match="$1"
  local entry
  # The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
  # older bash releases.
  for entry in ${ALLOWLIST[@]+"${ALLOWLIST[@]}"}; do
    # Trim surrounding whitespace with parameter expansion instead of forking
    # sed for every entry on every match.
    entry="${entry#"${entry%%[![:space:]]*}"}"
    entry="${entry%"${entry##*[![:space:]]}"}"
    [[ -z "$entry" ]] && continue
    # The quoted operand makes this a literal substring test, not a glob
    if [[ "$match" == *"$entry"* ]]; then
      return 0
    fi
  done
  return 1
}
#######################################
# Binary file detection: decide whether a path should be treated as text.
# An existing file is text when `file` reports a text/* MIME type. Otherwise
# (file missing, `file` unavailable, or a non-text MIME type) the decision
# falls back to the file NAME: the last path component is compared against
# well-known text extensions and file names, so nested files such as
# "docker/Dockerfile" or "app/Makefile" are recognized too.
# Arguments: $1 - path to classify
# Returns:   0 if the path is considered text, 1 otherwise
#######################################
is_text_file() {
  local filepath="$1"
  local mime=""
  if [[ -f "$filepath" ]]; then
    # `--` protects against paths that begin with a dash
    mime=$(file --brief --mime-type -- "$filepath" 2>/dev/null) || mime=""
    if [[ "$mime" == text/* ]]; then
      return 0
    fi
  fi
  # Fallback: check common text extensions against the basename only
  case "${filepath##*/}" in
    *.md|*.txt|*.json|*.yaml|*.yml|*.xml|*.toml|*.ini|*.cfg|*.conf|\
    *.sh|*.bash|*.zsh|*.ps1|*.bat|*.cmd|\
    *.py|*.rb|*.js|*.ts|*.jsx|*.tsx|*.go|*.rs|*.java|*.kt|*.cs|*.cpp|*.c|*.h|\
    *.php|*.swift|*.scala|*.r|*.R|*.lua|*.pl|*.ex|*.exs|*.hs|*.ml|\
    *.html|*.css|*.scss|*.less|*.svg|\
    *.sql|*.graphql|*.proto|\
    *.env|*.env.*|*.properties|\
    Dockerfile*|Makefile*|Vagrantfile|Gemfile|Rakefile)
      return 0 ;;
    *)
      return 1 ;;
  esac
}
#######################################
# Escape a string value for safe embedding in a JSON string literal.
# Backslash and double quote are backslash-escaped; newline, carriage return
# and tab are emitted as \n, \r and \t, since raw control characters are not
# allowed inside JSON strings. Other control characters are passed through.
# Arguments: $1 - raw string
# Outputs:   the escaped string on stdout, without a trailing newline
#######################################
json_escape() {
  local s="$1"
  # Backslashes first, so the escapes added below are not doubled again
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}
# Store findings as tab-separated records:
#   filepath<TAB>line<TAB>pattern_name<TAB>severity<TAB>redacted_match
FINDINGS=()

#######################################
# Scan one file against every entry in PATTERNS and record the findings.
# Globals:   PATTERNS, ALLOWLIST (read); FINDINGS, FINDING_COUNT (written)
# Arguments: $1 - path used for reporting, text detection and skip rules
#            $2 - optional path whose content is actually scanned
#                 (defaults to $1; used to scan a copy of the staged content)
# Returns:   0 always; files that are missing, not text, or lockfiles are
#            skipped silently
#######################################
scan_file() {
  local filepath="$1"
  local read_path="${2:-$1}"
  local entry pattern_name severity regex
  local line_num matched_line match redacted record

  # Skip if source does not exist (e.g., deleted)
  [[ -f "$read_path" ]] || return 0

  # Skip binary files (type detection uses the original path for MIME lookup)
  if ! is_text_file "$filepath"; then
    return 0
  fi

  # Skip common non-sensitive files; matching on the basename also covers
  # lockfiles that live in subdirectories
  case "${filepath##*/}" in
    *.lock|package-lock.json|yarn.lock|pnpm-lock.yaml|Cargo.lock|go.sum|*.sum)
      return 0 ;;
  esac

  for entry in "${PATTERNS[@]}"; do
    # The last variable receives the rest of the entry, so the regex may
    # itself contain '|'
    IFS='|' read -r pattern_name severity regex <<< "$entry"

    while IFS=: read -r line_num matched_line; do
      # grep -n yields "LINE:TEXT"; ignore anything else (for example a
      # "Binary file ... matches" notice)
      [[ "$line_num" =~ ^[0-9]+$ ]] || continue

      # Extract the matched fragment. -e is required because some patterns
      # start with '-' and would otherwise be parsed as grep options.
      match=$(grep -oE -e "$regex" <<< "$matched_line" 2>/dev/null | head -n 1) || true
      [[ -z "$match" ]] && continue

      # Strip boundary characters from IP:port matches
      if [[ "$pattern_name" == "INTERNAL_IP_PORT" ]]; then
        match=$(grep -oE '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+' <<< "$match" | head -n 1) || true
        [[ -z "$match" ]] && continue
      fi

      # Check allowlist
      if [[ ${#ALLOWLIST[@]} -gt 0 ]] && is_allowlisted "$match"; then
        continue
      fi

      # Skip if this looks like a placeholder or example
      if grep -qiE '(example|placeholder|your[_-]|xxx|changeme|TODO|FIXME|replace[_-]?me|dummy|fake|test[_-]?key|sample)' <<< "$match"; then
        continue
      fi

      # Redact the match for safe logging: show first 4 and last 4 chars
      if [[ ${#match} -le 12 ]]; then
        redacted="[REDACTED]"
      else
        redacted="${match:0:4}...${match: -4}"
      fi

      # Build the record with explicit \t separators so the field delimiter
      # does not depend on literal whitespace surviving in the source file
      printf -v record '%s\t%s\t%s\t%s\t%s' \
        "$filepath" "$line_num" "$pattern_name" "$severity" "$redacted"
      FINDINGS+=("$record")
      FINDING_COUNT=$((FINDING_COUNT + 1))
    done < <(grep -nE -e "$regex" -- "$read_path" 2>/dev/null || true)
  done
}
echo "🔍 Scanning ${#FILES[@]} modified file(s) for secrets..."
if [[ "$SCOPE" == "staged" ]]; then
  # Scan the staged (index) version to match what will actually be committed.
  # One temp file is reused for every path, and an EXIT trap removes it even
  # if the script aborts mid-scan, so staged content (which may hold the very
  # secrets being looked for) is not left behind in the temp directory.
  _tmpfile=$(mktemp)
  trap 'rm -f -- "$_tmpfile"' EXIT
  for filepath in "${FILES[@]}"; do
    # The redirection truncates the temp file first; if git cannot produce
    # the staged content the file is simply left empty
    git show ":$filepath" > "$_tmpfile" 2>/dev/null || true
    scan_file "$filepath" "$_tmpfile"
  done
  rm -f -- "$_tmpfile"
  trap - EXIT
else
  # Working-tree content is scanned in place
  for filepath in "${FILES[@]}"; do
    scan_file "$filepath"
  done
fi
# Log results: print a table of findings, append one JSON line to the scan
# log, and exit non-zero only when findings exist and MODE is "block".
if [[ $FINDING_COUNT -gt 0 ]]; then
  echo ""
  echo "⚠️ Found $FINDING_COUNT potential secret(s) in modified files:"
  echo ""
  printf " %-40s %-6s %-28s %s\n" "FILE" "LINE" "PATTERN" "SEVERITY"
  printf " %-40s %-6s %-28s %s\n" "----" "----" "-------" "--------"

  # Build JSON findings array and print table
  FINDINGS_JSON="["
  FIRST=true
  for finding in "${FINDINGS[@]}"; do
    IFS=$'\t' read -r fpath fline pname psev redacted <<< "$finding"
    printf " %-40s %-6s %-28s %s\n" "$fpath" "$fline" "$pname" "$psev"

    if [[ "$FIRST" != "true" ]]; then
      FINDINGS_JSON+=","
    fi
    FIRST=false

    # "line" is written as a bare JSON number, so it must be all digits;
    # anything else (for example an empty field from a malformed record)
    # becomes null so the log line stays valid JSON
    if [[ "$fline" =~ ^[0-9]+$ ]]; then
      json_line="$fline"
    else
      json_line="null"
    fi

    # Build JSON safely without requiring jq; every string value is escaped
    FINDINGS_JSON+="{\"file\":\"$(json_escape "$fpath")\",\"line\":$json_line,\"pattern\":\"$(json_escape "$pname")\",\"severity\":\"$(json_escape "$psev")\",\"match\":\"$(json_escape "$redacted")\"}"
  done
  FINDINGS_JSON+="]"
  echo ""

  # Write structured log entry
  printf '{"timestamp":"%s","event":"secrets_found","mode":"%s","scope":"%s","files_scanned":%d,"finding_count":%d,"findings":%s}\n' \
    "$TIMESTAMP" "$MODE" "$SCOPE" "${#FILES[@]}" "$FINDING_COUNT" "$FINDINGS_JSON" >> "$LOG_FILE"

  if [[ "$MODE" == "block" ]]; then
    echo "🚫 Session blocked: resolve the findings above before committing."
    echo " Set SCAN_MODE=warn to log without blocking, or add patterns to SECRETS_ALLOWLIST."
    exit 1
  else
    echo "💡 Review the findings above. Set SCAN_MODE=block to prevent commits with secrets."
  fi
else
  echo "✅ No secrets detected in ${#FILES[@]} scanned file(s)"
  printf '{"timestamp":"%s","event":"scan_complete","mode":"%s","scope":"%s","status":"clean","files_scanned":%d}\n' \
    "$TIMESTAMP" "$MODE" "$SCOPE" "${#FILES[@]}" >> "$LOG_FILE"
fi
exit 0