Files
awesome-copilot/hooks/fix-broken-links/link-fix.ps1
T
2026-06-18 01:00:35 +00:00

411 lines
17 KiB
PowerShell

#!/usr/bin/env pwsh
# fix-broken-links - link-fix.ps1 (PowerShell 7+ port of link-fix.sh)
#
# After the agent edits files (postToolUse): take the files it just changed,
# extract every http(s) URL, and check each one.
# • With file paths passed (the edited files, injected from the hook payload, or
# given on the command line) any URL that is not 200 gets spelling variations
# (http/https, www, trailing slash) then a Copilot CLI agent hand-off for more
# alternatives, followed by an interactive menu to replace / remove / skip.
# • With NO file arguments it only lists the broken links - no alternative
# lookups and no prompts.
# Generic anchor text is flagged as an SEO note either way.
#
# Pure PowerShell + .NET (Invoke-WebRequest/regex), plus an optional Copilot CLI
# hand-off for suggestions.
# Covers: HTML · Markdown · JS/TS · JSON · CSS · SQL · templates (all via URL scan)
# Trigger: postToolUse
# ── Runtime setup ─────────────────────────────────────────────────────────────
Set-StrictMode -Off
# Invoke-WebRequest is far faster without the progress bar.
$ProgressPreference = 'SilentlyContinue'

# Re-entrancy guard. The agent hand-off further down invokes `copilot`, which may
# itself re-fire this hook; the child run is marked with this environment
# variable, so bail out immediately when it is present.
if ($env:FIX_BROKEN_LINKS_AGENT) { exit 0 }

# ── Tunables ──────────────────────────────────────────────────────────────────
$LIMIT         = 50                                               # links per file before asking to continue
$TIMEOUT       = 10                                               # seconds per HTTP request
$UA            = 'Mozilla/5.0 (compatible; fix-broken-links/1.0)' # User-Agent sent with every request
$AGENT_MODEL   = 'gpt-5-mini'                                     # small, low-token model for the suggestion hand-off
$AGENT_TIMEOUT = 60                                               # seconds before giving up on the agent
# Extensions of the file types that are scanned for links.
$WEB_RE        = '\.(html?|xhtml|md|markdown|mdx|js|jsx|ts|tsx|vue|svelte|json|jsonl|css|sql|erb|jinja|j2|twig|ejs|pug|hbs)$'

# Positional args become the file list; the hook payload can also supply them.
$ScriptArgs = [System.Collections.Generic.List[string]]::new()
$args | ForEach-Object { [void]$ScriptArgs.Add([string]$_) }
# ── Hook stdin ────────────────────────────────────────────────────────────────
# Hook mode: no positional args and a redirected stdin means stdin carries the
# postToolUse JSON payload. The edited file paths are pulled out of it and
# appended to $ScriptArgs so Get-InputFiles picks them up.
$IsHook = $false
if ($ScriptArgs.Count -eq 0 -and [Console]::IsInputRedirected) {
    $IsHook = $true # invoked as a hook: stdin carries the tool payload
    $raw = [Console]::In.ReadToEnd()
    if ($raw.Trim()) {
        try {
            $json = $raw | ConvertFrom-Json

            # The tool name is accepted in camelCase or snake_case.
            $tool = $json.toolName
            if (-not $tool) { $tool = $json.tool_name }

            # No tool context - called manually with piped input, fall through.
            if ($tool) {
                $editTools = 'editFiles','edit','write','str_replace_editor','create_file','multiEdit','applyPatch'

                # Different tool (bash, read, etc.) - nothing to check
                if ($tool -notin $editTools) { exit 0 }

                # Only the files this edit tool just changed - never a wider repo scan.
                # First non-empty of: tool_input.files, toolInput.files,
                # tool_input.path, toolInput.path.
                $hookFiles = $json.tool_input.files
                if (-not $hookFiles) { $hookFiles = $json.toolInput.files }
                if (-not $hookFiles) { $hookFiles = $json.tool_input.path }
                if (-not $hookFiles) { $hookFiles = $json.toolInput.path }

                if ($hookFiles) {
                    foreach ($hf in $hookFiles) { [void]$ScriptArgs.Add([string]$hf) }
                }
            }
        } catch { }   # unparseable payload: deliberately ignored, no files are injected
    }
}
# $HaveParams: true when the positional list is non-empty - either the edited
# files from the hook payload or paths given on the command line. Only then does
# the full repair flow run (look up alternatives, then prompt to fix); with no
# parameters the script simply lists the broken links - no lookups, no prompts.
$HaveParams = ($ScriptArgs.Count -ne 0)

# $Interactive: prompts are only possible when input is a real console; once the
# hook JSON has been read from a redirected stdin we report rather than prompt.
$stdinIsConsole = -not [Console]::IsInputRedirected
$Interactive = [Environment]::UserInteractive -and $stdinIsConsole
# Prompt on the console and return the line the user typed.
#   $Prompt - text written (without a newline) before reading.
# Returns '' when the session is not interactive ($Interactive is false) or when
# the input stream is at end-of-file.
function Read-Answer {
    param([string]$Prompt)
    if ($Interactive) {
        [Console]::Out.Write($Prompt)
        $reply = [Console]::In.ReadLine()
        if ($null -ne $reply) { return $reply }
    }
    return ''
}
# ── Helpers ───────────────────────────────────────────────────────────────────
# Fetch $Url and return its HTTP status code as a string (e.g. '200', '404').
# Up to 5 redirects are followed, the request is capped at $TIMEOUT seconds and
# sent with the $UA user agent. When the request fails, the status code of the
# response attached to the exception is returned if there is one; otherwise
# 'ERR'.
function Get-HttpStatus {
    param([string]$Url)
    $request = @{
        Uri                = $Url
        MaximumRedirection = 5
        TimeoutSec         = $TIMEOUT
        UserAgent          = $UA
        ErrorAction        = 'Stop'
    }
    try {
        $response = Invoke-WebRequest @request
        return [string][int]$response.StatusCode
    } catch {
        $failed = $_.Exception.Response
        if ($failed -and $failed.StatusCode) {
            return [string][int]$failed.StatusCode
        }
        return 'ERR'
    }
}
# Break a URL into Scheme / Host / Path using plain string operations, the same
# way the bash port does (not [uri], so wildcards and odd paths survive intact).
#   Scheme - text before the first '://'
#   Host   - text between the scheme prefix and the first '/'
#   Path   - everything from that first '/' onward (query string included), or
#            '' when there is no '/' after the host
function Split-Url {
    param([string]$Url)
    $scheme = ($Url -split '://',2)[0]
    $rest = $Url -replace '^[a-zA-Z][a-zA-Z0-9+.-]*://',''
    $pieces = @($rest -split '/',2)
    $hostName = $pieces[0]
    $path = ''
    if ($pieces.Count -gt 1) { $path = '/' + $pieces[1] }
    [pscustomobject]@{ Scheme = $scheme; Host = $hostName; Path = $path }
}
# Every http(s) URL in a file, trailing punctuation trimmed, de-duplicated.
#   $File - path of the file to scan (read in full as one string).
# Emits the distinct URLs in sorted order.
#
# A URL ends at any whitespace, quote, angle bracket or ')'. Whitespace must be
# excluded as a class (\s), not just the space character: the file is read as a
# single string, so a URL at the end of a line would otherwise run on into the
# first word of the next line.
# De-duplication is case-sensitive because URL paths are case-sensitive; two
# links that differ only in case are distinct and both need checking.
function Get-Urls {
    param([string]$File)
    $text = [System.IO.File]::ReadAllText($File)
    [regex]::Matches($text, 'https?://[^\s"''<>)]+', 'IgnoreCase') |
        ForEach-Object { $_.Value -replace '[.,;:]+$','' } |
        Sort-Object -Unique -CaseSensitive
}
# Find generic anchor text that weakens SEO ("click here", "read more", ...).
#   $File - path of the file to scan.
# Emits the matched text of every hit: first all HTML anchors
# (<a ...>generic</a>), then all Markdown link openings ([generic]( ).
# Matching is case-insensitive.
function Get-SeoIssues {
    param([string]$File)
    $text = [System.IO.File]::ReadAllText($File)
    $reA = '<a[^>]*>\s*(click here|click|here|read more|more|this page|this|learn more|see more|view|visit|details|info)\s*</a>'
    $reB = '\[(click here|click|here|read more|more|this page|learn more|see more|details|info)\]\('
    foreach ($pattern in @($reA, $reB)) {
        foreach ($hit in [regex]::Matches($text, $pattern, 'IgnoreCase')) {
            $hit.Value
        }
    }
}
# Try common spelling variations of a URL and return the first one that answers
# 200, or '' when none does. Candidates, in the order they are tried:
#   1. the other scheme (http <-> https)
#   2. the host with 'www.' removed, or added when it is missing
#   3. the URL with a trailing slash - only when the path is non-empty, does not
#      already end in '/', and its last segment contains no '.'
# A candidate equal to the original URL is skipped.
function Find-Variation {
    param([string]$Url)
    $p = Split-Url $Url
    $scheme = $p.Scheme
    $hostName = $p.Host
    $path = $p.Path

    $cands = [System.Collections.Generic.List[string]]::new()

    if ($scheme -eq 'http') { [void]$cands.Add("https://$hostName$path") }
    elseif ($scheme -eq 'https') { [void]$cands.Add("http://$hostName$path") }

    if ($hostName -like 'www.*') {
        [void]$cands.Add("$scheme`://$($hostName.Substring(4))$path")
    } else {
        [void]$cands.Add("$scheme`://www.$hostName$path")
    }

    $lastSegment = ($path -split '/')[-1]
    if ($path -and $path -notmatch '/$' -and $lastSegment -notmatch '\.') {
        [void]$cands.Add(($Url -replace '/$','') + '/')
    }

    foreach ($c in $cands) {
        if ($c -ne $Url -and (Get-HttpStatus $c) -eq '200') { return $c }
    }
    return ''
}
# Ask the Copilot CLI agent for replacement candidates for a broken link.
#   $Url - the broken URL.
#   $Max - maximum number of alternatives to return.
# Returns a string array (possibly empty) of candidate URLs.
#
# The hand-off is one non-interactive prompt to the small $AGENT_MODEL, run as a
# background job so it can be capped at $AGENT_TIMEOUT seconds.
# `--available-tools` is passed with no tool names - presumably this leaves the
# agent with no tools enabled; confirm against the Copilot CLI documentation.
# The model may prefix a prose line, so http(s) tokens are pulled from anywhere
# in the output, trailing punctuation is trimmed, the broken URL itself is
# dropped, and the rest is de-duplicated case-insensitively. Nothing is returned
# when `copilot` is not installed or the job does not complete cleanly.
function Get-AgentAlts {
    param([string]$Url,[int]$Max)
    if (-not (Get-Command copilot -ErrorAction SilentlyContinue)) { return @() }

    $snappy = $AGENT_TIMEOUT - 5
    $prompt = "In under $snappy seconds, find up to $Max working alternative URLs for the broken link $Url. Hierarchically consider 1. Path and/or page spelling; 2. web.archive.org/wayback; 3. Redirects using redirect destination; 4. The context of the link's text; in order to resolve. Output only the URLs. One per line, and no: prose, numbering, markdown, backticks, special characters, post formatting."

    $out = ''
    try {
        # FIX_BROKEN_LINKS_AGENT marks the child run so a re-entrant hook exits early.
        $agentCall = {
            param($Prompt, $Model)
            $env:FIX_BROKEN_LINKS_AGENT = '1'
            copilot -p $Prompt -s --no-color --model $Model --available-tools 2>$null
        }
        $job = Start-Job -ScriptBlock $agentCall -ArgumentList $prompt, $AGENT_MODEL

        # Only read output from a job that completed cleanly; a failed/errored
        # copilot run yields no alternatives.
        $finished = Wait-Job $job -Timeout $AGENT_TIMEOUT
        if ($finished -and $job.State -eq 'Completed') {
            $out = (Receive-Job $job -ErrorAction SilentlyContinue | Out-String)
        }
        Remove-Job $job -Force -ErrorAction SilentlyContinue
    } catch { $out = '' }
    if (-not $out) { return @() }

    $seen = @{}
    $result = [System.Collections.Generic.List[string]]::new()
    $brokenKey = $Url.ToLower()
    foreach ($m in [regex]::Matches($out, 'https?://[^\s"''<>)\]]+', 'IgnoreCase')) {
        if ($result.Count -ge $Max) { break }
        $candidate = $m.Value -replace '[.,;:]+$',''
        $key = $candidate.ToLower()
        # Skip the broken URL itself and anything already collected.
        if ($key -eq $brokenKey -or $seen.ContainsKey($key)) { continue }
        $seen[$key] = $true
        [void]$result.Add($candidate)
    }
    # Leading comma: hand the array back as one object instead of unrolling it.
    return ,$result.ToArray()
}
# Collect up to $Max replacement URLs for a broken link, best first:
#   1. a working scheme/www/slash variation (Find-Variation, verified live 200)
#   2. alternatives proposed by the Copilot CLI agent (Get-AgentAlts)
# Entries are de-duplicated case-insensitively. The first item is what the `r`
# menu choice uses; the rest become the numbered alternatives.
#   $Url - the broken URL.
#   $Max - cap on the number of suggestions (default 6).
# Returns a string array (possibly empty).
function Get-SuggestedAlts {
    param([string]$Url,[int]$Max = 6)
    $known = @{}
    $picked = [System.Collections.Generic.List[string]]::new()

    $variation = Find-Variation $Url
    if ($variation) {
        $known[$variation.ToLower()] = $true
        [void]$picked.Add($variation)
    }

    foreach ($alt in (Get-AgentAlts $Url $Max)) {
        if ($picked.Count -ge $Max) { break }
        if (-not $alt) { continue }
        $key = $alt.ToLower()
        if (-not $known.ContainsKey($key)) {
            $known[$key] = $true
            [void]$picked.Add($alt)
        }
    }
    # Leading comma: hand the array back as one object instead of unrolling it.
    return ,$picked.ToArray()
}
# Replace every occurrence of the URL $Old in a file with $New.
#   $File - path of the file to rewrite in place.
#   $Old  - URL to replace; matched literally and case-sensitively.
#   $New  - replacement text, inserted literally.
#
# $Old is only replaced where it is a complete URL, not where it is merely the
# prefix of a longer one: replacing https://x/foo must not turn https://x/foobar
# into <new>bar. An occurrence counts as complete when it is followed by
# optional trailing punctuation (.,;:) and then whitespace, a quote, an angle
# bracket, ')' or the end of the file.
# An empty $Old is a no-op, and the file is only written when its content
# actually changed.
function Set-UrlReplacement {
    param([string]$File,[string]$Old,[string]$New)
    if (-not $Old) { return }
    $content = [System.IO.File]::ReadAllText($File)
    $pattern = [regex]::Escape($Old) + '(?=[.,;:]*(?:[\s"''<>)]|\z))'
    # '$' is special in a regex replacement string; '$$' inserts a literal '$'.
    $replacement = $New.Replace('$', '$$')
    $updated = [regex]::Replace($content, $pattern, $replacement)
    if ($updated -cne $content) {
        [System.IO.File]::WriteAllText($File, $updated)
    }
}
# Remove the link wrapper around a URL but keep the visible text:
#   <a href="URL">text</a>   ->  text
#   <a href='URL'>text</a>   ->  text
#   [text](URL)              ->  text
#   [text](URL "title")      ->  text
#   $File - path of the file to rewrite in place.
#   $Url  - the URL whose links are unwrapped (matched case-insensitively).
#
# In the Markdown form the URL must be the whole link target: after it only
# trailing punctuation (.,;:) and an optional whitespace-separated title may
# precede the ')'. Allowing arbitrary characters there would also unwrap links
# to longer URLs that merely start with $Url (e.g. .../foobar for .../foo).
# The file is only written when its content actually changed.
function Remove-LinkWrapper {
    param([string]$File,[string]$Url)
    $content = [System.IO.File]::ReadAllText($File)
    $esc = [regex]::Escape($Url)
    # Each element is parenthesized: the comma operator binds tighter than '+', so
    # without the parens the three concatenations collapse into a single string and
    # the array would hold one bogus pattern instead of three.
    $patterns = @(
        ('<a[^>]*href="' + $esc + '"[^>]*>([^<]*)</a>'),
        ("<a[^>]*href='" + $esc + "'[^>]*>([^<]*)</a>"),
        ('\[([^\]]*)\]\(' + $esc + '[.,;:]*(?:\s[^)]*)?\)')
    )
    $updated = $content
    foreach ($pat in $patterns) {
        $updated = [regex]::Replace($updated, $pat, '$1', 'IgnoreCase')
    }
    if ($updated -cne $content) {
        [System.IO.File]::WriteAllText($File, $updated)
    }
}
# ── File discovery ────────────────────────────────────────────────────────────
# Produce the candidate file list, first source that applies:
#   1. the positional list in $ScriptArgs (command line or hook payload);
#   2. nothing at all when fired as a hook whose payload carried no files - the
#      hook only ever checks edited files, never unrelated ones;
#   3. the files git reports as changed (diff against HEAD plus the staged
#      diff), when git is available and the current directory is in a repository;
#   4. every file under the current directory, as a relative path, skipping
#      .git, node_modules, dist, build, .next, .venv and __pycache__.
function Get-InputFiles {
    if ($ScriptArgs.Count -gt 0) { return $ScriptArgs.ToArray() }
    if ($IsHook) { return @() }

    $changed = @()
    if (Get-Command git -ErrorAction SilentlyContinue) {
        git rev-parse --git-dir *> $null
        if ($LASTEXITCODE -eq 0) {
            $vsHead = @(git diff --name-only HEAD 2>$null)
            $staged = @(git diff --name-only --cached 2>$null)
            $changed = $vsHead + $staged
        }
    }
    if ($changed.Count -gt 0) { return $changed }

    $skipDirs = '[\\/](\.git|node_modules|dist|build|\.next|\.venv|__pycache__)[\\/]'
    Get-ChildItem -Recurse -File -ErrorAction SilentlyContinue |
        Where-Object { $_.FullName -notmatch $skipDirs } |
        ForEach-Object { Resolve-Path -Relative -LiteralPath $_.FullName }
}
# Build $FILES: the de-duplicated list of existing, scannable files.
# A candidate from Get-InputFiles is kept only when it
#   - is non-empty and names an existing file (not a directory),
#   - does not live under node_modules, .git, dist or build,
#   - has one of the extensions in $WEB_RE.
# The script exits 0 when nothing is left to scan.
$seenFiles = @{}
$FILES = [System.Collections.Generic.List[string]]::new()
foreach ($f in (Get-InputFiles)) {
    if (-not $f) { continue }
    $f = ([string]$f).Trim()
    if (-not (Test-Path -LiteralPath $f -PathType Leaf)) { continue }
    # '(^|[\\/])' rather than a mandatory leading separator: relative paths such as
    # 'dist/app.js' (the form `git diff --name-only` emits) start with the directory
    # name and must be excluded too.
    if ($f -match '(^|[\\/])(node_modules|\.git|dist|build)[\\/]') { continue }
    if ($f -notmatch $WEB_RE) { continue }
    if ($seenFiles.ContainsKey($f)) { continue }
    $seenFiles[$f] = $true
    [void]$FILES.Add($f)
}
if ($FILES.Count -eq 0) { exit 0 }
# ── Scan ──────────────────────────────────────────────────────────────────────
# Parallel lists describing every broken link found; entry i of each list
# belongs to the same link.
$B_FILE    = [System.Collections.Generic.List[string]]::new()   # file containing the link
$B_URL     = [System.Collections.Generic.List[string]]::new()   # the broken URL
$B_STATUS  = [System.Collections.Generic.List[string]]::new()   # status returned by Get-HttpStatus
$B_ALT     = [System.Collections.Generic.List[object]]::new()   # array of suggested replacements
$SEO_LINES = [System.Collections.Generic.List[string]]::new()   # "file: anchor" SEO notes

foreach ($file in $FILES) {
    # SEO notes are collected for every file, whether or not it has links.
    foreach ($line in (Get-SeoIssues $file)) {
        if ($line) { [void]$SEO_LINES.Add("${file}: $line") }
    }

    $urls = @(Get-Urls $file)
    if ($urls.Count -eq 0) { continue }

    # Large files: ask before checking, but only in the repair flow. A
    # non-interactive run gets '' back from Read-Answer and carries on.
    $overLimit = $HaveParams -and $urls.Count -gt $LIMIT
    if ($overLimit) {
        $ans = Read-Answer " $file has $($urls.Count) links (limit $LIMIT). Continue? [Y/n] "
        if ($ans -in 'n','N','no','NO') { continue }
    }

    Write-Host ""
    Write-Host " Checking $($urls.Count) link(s) in $file ..."
    foreach ($url in $urls) {
        $status = Get-HttpStatus $url
        if ($status -ne '200') {
            Write-Host " BROKEN ($status) $url"
            # Only look up replacements when files were passed; otherwise just list.
            $alts = @()
            if ($HaveParams) { $alts = Get-SuggestedAlts $url 6 }
            [void]$B_ALT.Add($alts)
            [void]$B_STATUS.Add($status)
            [void]$B_URL.Add($url)
            [void]$B_FILE.Add($file)
        }
    }
}
# ── SEO report ────────────────────────────────────────────────────────────────
# Print the collected SEO notes, if any.
if ($SEO_LINES.Count -gt 0) {
    Write-Host ""
    Write-Host "------------------------------------------------------------"
    Write-Host " SEO anchor issues (consider descriptive link text)"
    foreach ($s in $SEO_LINES) {
        Write-Host " $s"
    }
}

# Nothing broken: say so and stop here, before the fix report.
if ($B_URL.Count -eq 0) {
    Write-Host ""
    Write-Host " No broken links found."
    Write-Host ""
    exit 0
}
# ── Interactive fix ───────────────────────────────────────────────────────────
# Walk the broken links in discovery order. Each one is always listed; the menu
# (and, on a real console, the prompt) is only shown in the repair flow.
# $CHANGED collects the files that were modified, keyed by path.
Write-Host ""
Write-Host "============================================================"
Write-Host " fix-broken-links report"
Write-Host "============================================================"
$CHANGED = @{}
$n = $B_URL.Count
for ($i = 0; $i -lt $n; $i++) {
    $file   = $B_FILE[$i]
    $url    = $B_URL[$i]
    $status = $B_STATUS[$i]
    $alts   = @($B_ALT[$i])

    Write-Host ""
    Write-Host " [$($i + 1)] $file"
    Write-Host " URL : $url"
    $note = if ($status -in 'ERR','000','TIMEOUT') { ' (unreachable)' } else { '' }
    Write-Host " HTTP: $status$note"

    # No file parameters → report-only: list the broken link and move on.
    if (-not $HaveParams) { continue }

    # Menu: `r` is the best suggestion, 1..N the remaining ones.
    Write-Host ""
    if ($alts.Count -gt 0) {
        Write-Host " r Replace -> $($alts[0])"
        for ($k = 1; $k -lt $alts.Count; $k++) {
            Write-Host " $k Replace -> $($alts[$k])"
        }
    }
    Write-Host " d Remove link, keep text"
    Write-Host " c Custom replacement URL"
    Write-Host " s Skip"

    if (-not $Interactive) {
        Write-Host " (no terminal - reporting only)"
        continue
    }

    # Keep asking until a choice is carried out or the link is skipped.
    while ($true) {
        $choice = Read-Answer ' > '

        # Skip (an empty answer also skips).
        if ($choice -eq 's' -or $choice -eq '') { break }

        # Remove the link wrapper, keep the text.
        if ($choice -eq 'd') {
            Remove-LinkWrapper $file $url
            $CHANGED[$file] = $true
            Write-Host " removed"
            break
        }

        # Replace with the best suggestion.
        if ($choice -eq 'r') {
            if ($alts.Count -gt 0) {
                Set-UrlReplacement $file $url $alts[0]
                $CHANGED[$file] = $true
                Write-Host " replaced -> $($alts[0])"
                break
            }
            Write-Host " no suggestion available"
            continue
        }

        # Replace with a numbered alternative.
        if ($choice -match '^[1-9]$') {
            $idx = [int]$choice
            if ($idx -lt $alts.Count) {
                Set-UrlReplacement $file $url $alts[$idx]
                $CHANGED[$file] = $true
                Write-Host " replaced -> $($alts[$idx])"
                break
            }
            Write-Host " invalid choice"
            continue
        }

        # Replace with a URL typed by the user; an empty answer re-prompts.
        if ($choice -eq 'c') {
            $custom = Read-Answer ' URL: '
            if ($custom) {
                Set-UrlReplacement $file $url $custom
                $CHANGED[$file] = $true
                Write-Host " replaced"
                break
            }
            continue
        }

        Write-Host " invalid choice"
    }
}
# Final summary: list the files that were modified, if any. The script always
# exits 0.
if ($CHANGED.Count -le 0) { exit 0 }
Write-Host ""
Write-Host " $($CHANGED.Count) file(s) updated:"
foreach ($f in $CHANGED.Keys) {
    Write-Host " $f"
}
Write-Host ""
exit 0