#!/usr/bin/env ts-node
/**
 * Standalone Scrapling CLI runner — no n8n needed.
 *
 * Usage:
 *   npx ts-node scripts/scrapling-run.ts <url> [options]
 *
 * Options:
 *   --fetcher  http|stealth|dynamic    (default: http)
 *   --selector <name>:<css-selector>   (can be repeated)
 *   --html                             include raw HTML in output
 *   --wait     <selector>              wait for selector (dynamic only)
 *   --timeout  <n>                     (default: 30000)
 *   --format   pretty|json             (default: pretty)
 *   --service  <url>                   service URL override
 *
 * Credentials (from .env.test or environment):
 *   SCRAPLING_SERVICE_URL   default: http://localhost:8765
 *   SCRAPLING_API_KEY       optional
 *
 * Examples:
 *   npx ts-node scripts/scrapling-run.ts https://example.com
 *   npx ts-node scripts/scrapling-run.ts https://news.ycombinator.com --fetcher stealth --selector "title:title"
 *   npx ts-node scripts/scrapling-run.ts https://spa.example.com --fetcher dynamic --wait "#app"
 */

import * as https from 'https';
import * as http from 'http';
import * as path from 'path';
import * as fs from 'fs';

/** Prints `message` to stderr and terminates the process with exit code 1. */
function fail(message: string): never {
  console.error(message);
  process.exit(1);
}

// ── Load .env.test if present ─────────────────────────────────────────────────

/**
 * Removes one pair of surrounding quotes, but only when the value both starts
 * and ends with the same quote character. A lone leading or trailing quote is
 * part of the value and is left untouched.
 */
function stripMatchingQuotes(value: string): string {
  if (value.length < 2) return value;
  const first = value.charAt(0);
  const last = value.charAt(value.length - 1);
  const quoted = (first === '"' || first === "'") && first === last;
  return quoted ? value.slice(1, -1) : value;
}

/**
 * Reads `KEY=value` pairs from `file` (if it exists) into `process.env`.
 * Blank lines, `#` comments and lines without `=` are skipped. Variables that
 * already hold a non-empty value in the environment are not overwritten.
 */
function loadEnvFile(file: string): void {
  if (!fs.existsSync(file)) return;
  for (const line of fs.readFileSync(file, 'utf-8').split('\n')) {
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith('#')) continue;
    const eq = trimmed.indexOf('=');
    if (eq === -1) continue;
    const key = trimmed.slice(0, eq).trim();
    if (!key) continue; // "=value" has no variable name to assign to
    const value = stripMatchingQuotes(trimmed.slice(eq + 1).trim());
    if (!process.env[key]) process.env[key] = value;
  }
}

loadEnvFile(path.resolve(__dirname, '..', '.env.test'));

// ── Parse CLI args ────────────────────────────────────────────────────────────

const USAGE =
  'Usage: scrapling-run.ts <url> [--fetcher http|stealth|dynamic] [--selector name:selector] ...';

/** Flags that consume the following argument as their value. */
const VALUE_FLAGS: ReadonlySet<string> = new Set([
  '--fetcher',
  '--selector',
  '--wait',
  '--timeout',
  '--format',
  '--service',
]);

const FETCHER_TYPES = ['http', 'stealth', 'dynamic'] as const;
type FetcherType = (typeof FETCHER_TYPES)[number];

const OUTPUT_FORMATS = ['pretty', 'json'] as const;
type OutputFormat = (typeof OUTPUT_FORMATS)[number];

interface ParsedArgs {
  /** Arguments that are neither flags nor flag values, in order. */
  positionals: string[];
  /** Values collected per value-taking flag, in order of appearance. */
  values: Map<string, string[]>;
  /** Every other `--flag` seen (boolean switches and unknown flags). */
  switches: Set<string>;
}

/**
 * Splits raw CLI arguments into positionals, flag values and switches.
 *
 * The value that follows a value-taking flag is consumed together with the
 * flag, so it can never be mistaken for the positional URL. A value-taking
 * flag that is last on the command line, or is directly followed by another
 * `--flag`, terminates the process with an error instead of silently
 * swallowing the next token.
 */
function parseArgs(argv: readonly string[]): ParsedArgs {
  const positionals: string[] = [];
  const values = new Map<string, string[]>();
  const switches = new Set<string>();

  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (arg === undefined) continue;

    if (VALUE_FLAGS.has(arg)) {
      const value = argv[i + 1];
      if (value === undefined || value.startsWith('--')) {
        return fail(`Missing value for ${arg}\n${USAGE}`);
      }
      const bucket = values.get(arg) ?? [];
      bucket.push(value);
      values.set(arg, bucket);
      i++; // the value has been consumed
    } else if (arg.startsWith('--')) {
      switches.add(arg);
    } else {
      positionals.push(arg);
    }
  }

  return { positionals, values, switches };
}

const args = process.argv.slice(2);
const parsedArgs = parseArgs(args);

const url: string = parsedArgs.positionals[0] ?? fail(USAGE);

/** Returns the value of the first occurrence of `flag`, or `undefined` if absent. */
function getArg(flag: string): string | undefined {
  return parsedArgs.values.get(flag)?.[0];
}

/** Returns true when the boolean switch `flag` was passed. */
function getFlag(flag: string): boolean {
  return parsedArgs.switches.has(flag);
}

/** Returns the values of every occurrence of `flag`, in order. */
function getArgs(flag: string): string[] {
  return [...(parsedArgs.values.get(flag) ?? [])];
}

/** Returns `value` narrowed to one of `allowed`, or exits with an error naming `flag`. */
function parseChoice<T extends string>(flag: string, value: string, allowed: readonly T[]): T {
  const match = allowed.find((choice) => choice === value);
  return match ?? fail(`Invalid ${flag} value: "${value}". Expected one of: ${allowed.join(', ')}`);
}

/**
 * Parses the `--timeout` value. `parseInt` is kept for compatibility with
 * inputs that were previously accepted, but NaN and negative results are
 * rejected instead of being sent to the service (NaN serialises as `null`).
 */
function parseTimeout(raw: string): number {
  const parsed = parseInt(raw, 10);
  if (Number.isNaN(parsed) || parsed < 0) {
    return fail(`Invalid --timeout value: "${raw}". Expected a non-negative integer`);
  }
  return parsed;
}

const fetcherType: FetcherType = parseChoice('--fetcher', getArg('--fetcher') ?? 'http', FETCHER_TYPES);
const returnHtml = getFlag('--html');
const waitSelector = getArg('--wait');
const timeout = parseTimeout(getArg('--timeout') ?? '30000');
const outputFormat: OutputFormat = parseChoice('--format', getArg('--format') ?? 'pretty', OUTPUT_FORMATS);
const serviceUrl = (
  getArg('--service') ??
  process.env.SCRAPLING_SERVICE_URL ??
  'http://localhost:8765'
).replace(/\/$/, '');
const apiKey = process.env.SCRAPLING_API_KEY ?? '';

const rawSelectors = getArgs('--selector');
const selectors = rawSelectors.map((raw) => {
  // Split on the FIRST colon only, so selectors such as "a:hover" stay intact.
  const colonIdx = raw.indexOf(':');
  const name = colonIdx === -1 ? '' : raw.slice(0, colonIdx);
  const selector = colonIdx === -1 ? '' : raw.slice(colonIdx + 1);
  if (!name || !selector) {
    return fail(`Invalid selector format: "${raw}". Expected "name:selector"`);
  }
  return {
    name,
    selector,
    selector_type: 'css' as const,
    multiple: false,
  };
});

// ── Minimal HTTP client ───────────────────────────────────────────────────────

/**
 * POSTs `body` as JSON to `reqUrl` using Node's built-in http/https modules.
 *
 * @param reqUrl  Absolute http(s) URL to post to.
 * @param body    Value serialised with `JSON.stringify` as the request body.
 * @param headers Extra request headers; they override the default
 *                `Content-Type` / `Content-Length` on key collision.
 * @returns The parsed JSON response, or the raw response text when the body
 *          is not valid JSON.
 * @throws Rejects with an `Error` carrying a `statusCode` property when the
 *         response status is >= 400, and with the underlying error on
 *         request or response stream failures (including an invalid URL).
 */
function postJson(reqUrl: string, body: unknown, headers: Record<string, string>): Promise<unknown> {
  return new Promise((resolve, reject) => {
    const bodyStr = JSON.stringify(body);
    const parsed = new URL(reqUrl);
    const isHttps = parsed.protocol === 'https:';
    const transport = isHttps ? https : http;

    const req = transport.request(
      {
        hostname: parsed.hostname,
        port: parsed.port || (isHttps ? 443 : 80),
        path: parsed.pathname + parsed.search,
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Content-Length': Buffer.byteLength(bodyStr).toString(),
          ...headers,
        },
      },
      (res) => {
        const chunks: Buffer[] = [];
        res.on('data', (c: Buffer) => chunks.push(c));
        // Without this a mid-stream failure would leave the promise pending.
        res.on('error', reject);
        res.on('end', () => {
          const text = Buffer.concat(chunks).toString('utf-8');
          if (res.statusCode && res.statusCode >= 400) {
            reject(
              Object.assign(new Error(`HTTP ${res.statusCode}: ${text}`), {
                statusCode: res.statusCode,
              }),
            );
            return;
          }
          try {
            resolve(JSON.parse(text));
          } catch {
            resolve(text);
          }
        });
      },
    );

    req.on('error', reject);
    req.write(bodyStr);
    req.end();
  });
}

// ── Main ──────────────────────────────────────────────────────────────────────

/** Shape of the `/scrape` response fields that this script reads. */
interface ScrapeResponse {
  url: string;
  status_code: number;
  html?: string;
  data: Record<string, unknown>;
  fetcher_used: string;
  elapsed_ms: number;
  error?: string;
}

/** Type guard for plain (non-null, non-array) objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/**
 * Renders one extracted value for the pretty printer: arrays as an item
 * count, other objects as JSON (rather than "[object Object]"), and
 * everything else via `String`.
 */
function formatValue(val: unknown): string {
  if (Array.isArray(val)) return `[${val.length} items]`;
  if (typeof val === 'object' && val !== null) return JSON.stringify(val);
  return String(val);
}

/**
 * Sends the scrape request built from the CLI arguments to
 * `${serviceUrl}/scrape` and prints the result in the requested format.
 *
 * @throws When the request fails, or (pretty format only) when the service
 *         reply is not a JSON object.
 */
async function main(): Promise<void> {
  console.log(`Service: ${serviceUrl}`);
  console.log(`URL: ${url}`);
  console.log(`Fetcher: ${fetcherType}`);
  if (selectors.length) {
    console.log(`Selectors: ${selectors.map((s) => `${s.name}:${s.selector}`).join(', ')}`);
  }
  console.log();

  const payload: Record<string, unknown> = {
    url,
    fetcher_type: fetcherType,
    return_html: returnHtml,
    timeout,
    selectors,
  };
  if (waitSelector) payload.wait_selector = waitSelector;

  const requestHeaders: Record<string, string> = {};
  if (apiKey) requestHeaders['X-API-Key'] = apiKey;

  const response = await postJson(`${serviceUrl}/scrape`, payload, requestHeaders);

  if (outputFormat === 'json') {
    console.log(JSON.stringify(response, null, 2));
    return;
  }

  // postJson falls back to raw text for non-JSON bodies; the pretty printer
  // needs an object, so report that clearly instead of failing on a missing field.
  if (!isRecord(response)) {
    throw new Error(`Unexpected non-JSON response from service: ${String(response).slice(0, 200)}`);
  }
  // The object shape was checked above; individual fields are still treated as optional.
  const result = response as Partial<ScrapeResponse>;

  // Pretty output
  console.log(`Status: ${result.status_code}`);
  console.log(`Fetcher: ${result.fetcher_used}`);
  console.log(`Elapsed: ${result.elapsed_ms}ms`);

  if (result.error) {
    console.error(`\nError: ${result.error}`);
    return;
  }

  const data = isRecord(result.data) ? result.data : {};
  if (Object.keys(data).length > 0) {
    console.log('\nExtracted data:');
    console.log('─'.repeat(50));
    for (const [key, val] of Object.entries(data)) {
      console.log(` ${key.padEnd(25)} ${formatValue(val)}`);
    }
  }

  if (typeof result.html === 'string' && result.html) {
    console.log('\nHTML preview (first 500 chars):');
    console.log('─'.repeat(50));
    console.log(result.html.slice(0, 500));
  }
}

main().catch((err: unknown) => {
  console.error('\nError:', err instanceof Error ? err.message : err);
  process.exit(1);
});